From cb949fc94b72fc70a9822db94d7ba92b38d2dfd3 Mon Sep 17 00:00:00 2001
From: jerryzhuang
Date: Fri, 20 Dec 2024 18:00:05 +1100
Subject: [PATCH] refactor: add inference manifest template and generation script

---
 .github/e2e-preset-configs.json | 180 +++++++++++++++---
 .github/workflows/e2e-preset-test.yml | 102 ++++------
 .../falcon-40b-instruct-service.yaml | 13 --
 .../falcon-40b-instruct_hf.yaml | 56 ------
 .../falcon-40b-instruct_vllm.yaml | 56 ------
 .../falcon-40b/falcon-40b-service.yaml | 13 --
 .../manifests/falcon-40b/falcon-40b_hf.yaml | 56 ------
 .../manifests/falcon-40b/falcon-40b_vllm.yaml | 56 ------
 .../falcon-7b-instruct-service.yaml | 13 --
 .../falcon-7b-instruct_hf.yaml | 56 ------
 .../falcon-7b-instruct_vllm.yaml | 56 ------
 .../falcon-7b/falcon-7b-service.yaml | 13 --
 .../manifests/falcon-7b/falcon-7b_hf.yaml | 56 ------
 .../manifests/falcon-7b/falcon-7b_vllm.yaml | 56 ------
 .../manifests/inference-tmpl/manifest.yaml | 100 ++++++++++
 .../llama-2-13b-chat-service.yaml | 19 --
 .../llama-2-13b-chat/llama-2-13b-chat.yaml | 28 ++-
 .../llama-2-13b/llama-2-13b-service.yaml | 19 --
 .../manifests/llama-2-13b/llama-2-13b.yaml | 26 ++-
 .../llama-2-7b-chat-service.yaml | 14 --
 .../llama-2-7b-chat/llama-2-7b-chat.yaml | 17 +-
 .../llama-2-7b/llama-2-7b-service.yaml | 14 --
 .../test/manifests/llama-2-7b/llama-2-7b.yaml | 17 +-
 .../mistral-7b-instruct-service.yaml | 13 --
 .../mistral-7b-instruct_hf.yaml | 55 ------
 .../mistral-7b-instruct_vllm.yaml | 55 ------
 .../mistral-7b/mistral-7b-service.yaml | 13 --
 .../manifests/mistral-7b/mistral-7b_hf.yaml | 55 ------
 .../manifests/mistral-7b/mistral-7b_vllm.yaml | 55 ------
 .../test/manifests/phi-2/phi-2-service.yaml | 13 --
 .../test/manifests/phi-2/phi-2_hf.yaml | 55 ------
 .../test/manifests/phi-2/phi-2_vllm.yaml | 55 ------
 .../phi-3-medium-128k-instruct-service.yaml | 13 --
 .../phi-3-medium-128k-instruct_hf.yaml | 55 ------
 .../phi-3-medium-128k-instruct_vllm.yaml | 55 ------
 .../phi-3-medium-4k-instruct-service.yaml | 13 --
 .../phi-3-medium-4k-instruct_hf.yaml | 55 ------
 .../phi-3-medium-4k-instruct_vllm.yaml | 55 ------
 .../phi-3-mini-128k-instruct-service.yaml | 13 --
 .../phi-3-mini-128k-instruct_hf.yaml | 55 ------
 .../phi-3-mini-128k-instruct_vllm.yaml | 55 ------
 .../phi-3-mini-4k-instruct-service.yaml | 13 --
 .../phi-3-mini-4k-instruct_hf.yaml | 55 ------
 .../phi-3-mini-4k-instruct_vllm.yaml | 55 ------
 .../phi-3-small-128k-instruct-service.yaml | 13 --
 .../phi-3-small-128k-instruct.yaml | 55 ------
 .../phi-3-small-8k-instruct-service.yaml | 13 --
 .../phi-3-small-8k-instruct.yaml | 55 ------
 .../qwen2-5-coder-7b-instruct-service.yaml | 13 --
 .../qwen2-5-coder-7b-instruct_hf.yaml | 55 ------
 .../qwen2-5-coder-7b-instruct_vllm.yaml | 83 --------
 .../manifests/tuning/falcon/falcon-7b.yaml | 103 ----------
 .../tuning/tuning.yaml} | 0
 presets/workspace/test/scripts/README.md | 82 ++++++++
 .../test/scripts/generate_manifests.py | 112 +++++++++++
 55 files changed, 564 insertions(+), 1917 deletions(-)
 delete mode 100644 presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml
 delete mode 100644
presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml create mode 100644 presets/workspace/test/manifests/inference-tmpl/manifest.yaml delete mode 100644 presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml delete mode 100644 presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml delete mode 100644 presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml delete mode 100644 presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-2/phi-2-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-2/phi-2_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml delete mode 100644 
presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml delete mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/tuning/falcon/falcon-7b.yaml rename presets/workspace/test/{tuning/tuning-job.yaml => manifests/tuning/tuning.yaml} (100%) create mode 100644 presets/workspace/test/scripts/README.md create mode 100644 presets/workspace/test/scripts/generate_manifests.py diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json index 9d20b0e38..37f49667a 100644 --- a/.github/e2e-preset-configs.json +++ b/.github/e2e-preset-configs.json @@ -7,16 +7,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false - }, - { - "name": "falcon-7b-adapter", - "node-count": 1, - "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 100, - "OSS": true, - "loads_adapter": true, - "expected_adapter": "amod-mental-health" + "loads_adapter": false, + "node_pool": "falcon7b", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja", + "gpu_count": 1 + } + } }, { "name": "falcon-7b-instruct", @@ -24,7 +26,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "falcon7binst", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "falcon-40b", @@ -32,7 +45,18 @@ "node-vm-size": "Standard_NC48ads_A100_v4", "node-osdisk-size": 400, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "falcon40b", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 2 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "falcon-40b-instruct", @@ -40,7 +64,18 @@ "node-vm-size": "Standard_NC48ads_A100_v4", "node-osdisk-size": 400, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "falcon40bins", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 2 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template 
/workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "mistral-7b", @@ -48,7 +83,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "mistral7b", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja", + "gpu_count": 1 + } + } }, { "name": "mistral-7b-instruct", @@ -56,7 +102,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "mistral7bins", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16", + "gpu_count": 1 + } + } }, { "name": "phi-2", @@ -64,7 +121,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 50, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi2", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16", + "gpu_count": 1 + } + } }, { "name": "phi-3-mini-4k-instruct", @@ -72,7 +140,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 50, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi3mini4kin", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16", + "gpu_count": 1 + } + } }, { "name": "phi-3-mini-128k-instruct", @@ -80,7 +159,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 50, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi3mini128k", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16", + "gpu_count": 1 + } + } }, { "name": "phi-3-medium-4k-instruct", @@ -88,7 +178,18 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi3medium4k", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": 
"phi-3-medium-128k-instruct", @@ -96,7 +197,18 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi3medium12", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --max-model-len 1024 --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "qwen2.5-coder-7b-instruct", @@ -105,7 +217,18 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "qwen25coder7", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "llama-2-7b", @@ -113,7 +236,8 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": false, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "llama27b" }, { "name": "llama-2-7b-chat", @@ -121,7 +245,8 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": false, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "llama27bchat" }, { "name": "llama-2-13b", @@ -129,7 +254,8 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 150, "OSS": false, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "llama213b" }, { "name": "llama-2-13b-chat", @@ -137,7 +263,8 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 150, "OSS": false, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "llama213bchat" }, { "name": "tuning", @@ -145,7 +272,8 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "tuning" } ] } diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 123bc1436..d859308d8 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -107,6 +107,7 @@ jobs: environment: preset-env strategy: fail-fast: false + max-parallel: 10 matrix: # Ex matrix element: # {"name":"falcon-40b","type":"text-generation","version":"#", @@ -164,44 +165,49 @@ jobs: - name: Set up kubectl context run: | az aks get-credentials --resource-group llm-test --name GitRunner - - - name: Get Nodepool Name - id: get_nodepool_name + + - name: Get testing workload + id: workload run: | - NAME_SUFFIX=${{ matrix.model.name }} - NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/} # Removing all '-' symbols - NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX_WITHOUT_DASHES//./} # Removing all '.' 
symbols + NODEPOOL_NAME=${{ matrix.model.node_pool }} + echo "NODEPOOL_NAME=$NODEPOOL_NAME" >> $GITHUB_OUTPUT - if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then - TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES:0:12} - else - TRUNCATED_NAME_SUFFIX=$NAME_SUFFIX_WITHOUT_DASHES - fi - echo "Nodepool Name: $TRUNCATED_NAME_SUFFIX" - echo "NODEPOOL_NAME=$TRUNCATED_NAME_SUFFIX" >> $GITHUB_OUTPUT + WORKLOAD_NAME=${{ matrix.model.workload || matrix.model.name }} + echo "WORKLOAD_NAME=$WORKLOAD_NAME" >> $GITHUB_OUTPUT + WORKLOAD_FILE=$WORKLOAD_NAME.yaml + echo "WORKLOAD_FILE=$WORKLOAD_FILE" >> $GITHUB_OUTPUT + RESOURCE_TYPE=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "statefulset" || echo "deployment") + echo "RESOURCE_TYPE=$RESOURCE_TYPE" >> $GITHUB_OUTPUT + + pip install pyyaml + python3 presets/workspace/test/scripts/generate_manifests.py ${{ matrix.model.name }} ${{ env.RUNTIME }} \ + --tag ${{ matrix.model.tag }} \ + --repo ${{ secrets.ACR_AMRT_USERNAME }}.azurecr.io > $WORKLOAD_FILE + + cat $WORKLOAD_FILE - name: Create Nodepool run: | NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'name' -o tsv || echo "") echo "NODEPOOL_EXIST: $NODEPOOL_EXIST" if [ -z "$NODEPOOL_EXIST" ]; then az aks nodepool add \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test \ --node-count ${{ matrix.model.node-count }} \ --node-vm-size ${{ matrix.model.node-vm-size }} \ --node-osdisk-size ${{ matrix.model.node-osdisk-size }} \ - --labels pool=${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --labels pool=${{ steps.workload.outputs.NODEPOOL_NAME }} \ --node-taints sku=gpu:NoSchedule \ --aks-custom-headers UseGPUDedicatedVHD=true else NODEPOOL_STATE=$(az aks nodepool show \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'provisioningState' -o tsv) @@ -214,44 +220,14 @@ jobs: fi fi - - name: Get testing workload - id: workload + - name: Create workload run: | - WORKLOAD_NAME=${{ matrix.model.workload || matrix.model.name }} - echo "WORKLOAD_NAME=$WORKLOAD_NAME" >> $GITHUB_OUTPUT - echo "WORKLOAD_FILE_PREFIX=presets/workspace/test/manifests/$WORKLOAD_NAME/$WORKLOAD_NAME" >> $GITHUB_OUTPUT - - - name: Create Service - run: | - kubectl apply -f ${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}-service.yaml - - - name: Retrieve External Service IP - id: get_ip - run: | - SERVICE_IP=$(kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} -o=jsonpath='{.spec.clusterIP}') - echo "Service IP is $SERVICE_IP" - echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT - - - name: Get Resource Type - id: resource - run: | - RESOURCE_TYPE=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "statefulset" || echo "deployment") - echo "RESOURCE_TYPE=$RESOURCE_TYPE" >> $GITHUB_OUTPUT - - - name: Replace IP and Deploy Resource to K8s - run: | - POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}") - WORKLOAD_FILE=${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}$POSTFIX.yaml - - sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" $WORKLOAD_FILE - sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" $WORKLOAD_FILE - sed -i 
"s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" $WORKLOAD_FILE - kubectl apply -f $WORKLOAD_FILE + kubectl apply -f ${{steps.workload.outputs.WORKLOAD_FILE}} - name: Wait for Resource to be ready run: | - kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} --timeout=1800s - + kubectl rollout status ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} --timeout=1800s + - name: Check Adapter Loading from Logs if: matrix.model.loads_adapter == true run: | @@ -260,19 +236,19 @@ jobs: - name: Install testing commands run: | - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get update - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get install -y curl + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get update + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get install -y curl - name: Test healthz endpoint run: | - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s http://localhost:5000/health - name: Test inference endpoint run: | echo "Testing inference for ${{ matrix.model.name }}" if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s -X POST \ -H "Content-Type: application/json" \ -d '{ @@ -293,7 +269,7 @@ jobs: }' \ http://localhost:5000/chat elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s -X POST \ -H "Content-Type: application/json" \ -d '{ @@ -309,7 +285,7 @@ jobs: }' \ http://localhost:5000/generate elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s -X POST \ -H "accept: application/json" \ -H "Content-Type: application/json" \ @@ -328,7 +304,7 @@ jobs: }' \ http://localhost:5000/v1/chat/completions else - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s -X POST \ -H "accept: application/json" \ -H "Content-Type: application/json" \ @@ -370,9 +346,9 @@ jobs: if: always() run: | # Only proceed if RESOURCE_TYPE is set (else resource wasn't created) - if [ -n "${{ steps.resource.outputs.RESOURCE_TYPE }}" ]; then + if [ -n "${{ steps.workload.outputs.RESOURCE_TYPE }}" ]; then # Use RESOURCE_TYPE from the previous step - RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }} + RESOURCE_TYPE=${{ steps.workload.outputs.RESOURCE_TYPE }} # Check and Delete K8s Resource (Deployment or StatefulSet) if kubectl get $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}} > 
/dev/null 2>&1; then @@ -387,16 +363,16 @@ jobs: fi # Check and Delete AKS Nodepool if it exists - if [ -n "${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }}" ]; then + if [ -n "${{ steps.workload.outputs.NODEPOOL_NAME }}" ]; then NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'name' -o tsv || echo "") if [ -n "$NODEPOOL_EXIST" ]; then az aks nodepool delete \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test fi diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml deleted file mode 100644 index fc357931a..000000000 --- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: falcon-40b-instruct -spec: - selector: - app: falcon - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml deleted file mode 100644 index a44043894..000000000 --- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b-instruct -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40bins diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml deleted file mode 100644 index 7b40cbac4..000000000 --- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b-instruct -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 
--chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40bins diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml deleted file mode 100644 index 80ab4b539..000000000 --- a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: falcon-40b -spec: - selector: - app: falcon - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml deleted file mode 100644 index 514d12e60..000000000 --- a/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40b diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml deleted file mode 100644 index 7e74ac7a7..000000000 --- a/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - 
periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40b diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml deleted file mode 100644 index 2f27d46cb..000000000 --- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: falcon-7b-instruct -spec: - selector: - app: falcon - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml deleted file mode 100644 index 1b2092b36..000000000 --- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b-instruct -spec: - progressDeadlineSeconds: 1200 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7binst diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml deleted file mode 100644 index 4019d64f5..000000000 --- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b-instruct -spec: - progressDeadlineSeconds: 1200 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - 
periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7binst diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml deleted file mode 100644 index 595e83942..000000000 --- a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: falcon-7b -spec: - selector: - app: falcon - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml deleted file mode 100644 index 56a775fff..000000000 --- a/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b -spec: - progressDeadlineSeconds: 1200 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7b diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml deleted file mode 100644 index bceb14560..000000000 --- a/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b -spec: - progressDeadlineSeconds: 1200 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7b diff --git 
a/presets/workspace/test/manifests/inference-tmpl/manifest.yaml b/presets/workspace/test/manifests/inference-tmpl/manifest.yaml new file mode 100644 index 000000000..3609065da --- /dev/null +++ b/presets/workspace/test/manifests/inference-tmpl/manifest.yaml @@ -0,0 +1,100 @@ +deployment: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: WORKLOAD_NAME + spec: + progressDeadlineSeconds: 1800 + replicas: 1 + selector: + matchLabels: + app: WORKLOAD_NAME + template: + metadata: + labels: + app: WORKLOAD_NAME + spec: + containers: + - name: WORKLOAD_NAME-container + image: REPO/MODEL_NAME:TAG + command: + - /bin/sh + - -c + - RUNTIME_COMMAND + resources: + requests: + nvidia.com/gpu: GPU_COUNT + limits: + nvidia.com/gpu: GPU_COUNT + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + - name: config-volume + mountPath: /mnt/config + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: config-volume + configMap: + defaultMode: 420 + name: testing-inference-params + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: NODE_POOL + +config: + apiVersion: v1 + kind: ConfigMap + metadata: + name: testing-inference-params + data: + inference_config.yaml: | + # Maximum number of steps to find the max available seq len fitting in the GPU memory. + max_probe_steps: 6 + + vllm: + cpu-offload-gb: 0 + gpu-memory-utilization: 0.95 + swap-space: 4 + served-model-name: test + dtype: float16 + + # max-seq-len-to-capture: 8192 + # num-scheduler-steps: 1 + # enable-chunked-prefill: false + # see https://docs.vllm.ai/en/stable/models/engine_args.html for more options. 
+ +service: + apiVersion: v1 + kind: Service + metadata: + name: WORKLOAD_NAME + spec: + ports: + - port: 5000 + targetPort: 5000 + protocol: TCP + name: http + selector: + app: WORKLOAD_NAME + type: ClusterIP \ No newline at end of file diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml deleted file mode 100644 index 58720a91d..000000000 --- a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: llama-2-13b-chat -spec: - selector: - app: llama - statefulset.kubernetes.io/pod-name: llama-2-13b-chat-0 - ports: - - name: http - protocol: TCP - port: 80 - targetPort: 5000 - - name: torchrun - protocol: TCP - port: 29500 - targetPort: 29500 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml index 2520ddb9c..1bbef236e 100644 --- a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml +++ b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml @@ -25,14 +25,12 @@ spec: topologyKey: "kubernetes.io/hostname" containers: - name: llama-container - image: REPO_HERE.azurecr.io/llama-2-13b-chat:TAG_HERE # Placeholder that will be replaced - env: - - name: MASTER_ADDR - value: "MASTER_ADDR_HERE" # Placeholder that will be replaced + image: REPO/llama-2-13b-chat:TAG # Placeholder that will be replaced command: - /bin/sh - -c - | + MASTER_ADDR=$(dig +short llama-2-13b-chat.default.svc.cluster.local) echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$') cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py @@ -69,4 +67,24 @@ spec: key: nvidia.com/gpu operator: Exists nodeSelector: - pool: llama213bcha + pool: llama213bchat +--- +apiVersion: v1 +kind: Service +metadata: + name: llama-2-13b-chat +spec: + selector: + app: llama + statefulset.kubernetes.io/pod-name: llama-2-13b-chat-0 + ports: + - name: http + protocol: TCP + port: 80 + targetPort: 5000 + - name: torchrun + protocol: TCP + port: 29500 + targetPort: 29500 + type: ClusterIP + publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml deleted file mode 100644 index f43826a48..000000000 --- a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: llama-2-13b -spec: - selector: - app: llama - statefulset.kubernetes.io/pod-name: llama-2-13b-0 - ports: - - name: http - protocol: TCP - port: 80 - targetPort: 5000 - - name: torchrun - protocol: TCP - port: 29500 - targetPort: 29500 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml index 3bc5f72d2..314d81095 100644 --- a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml +++ b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml @@ -25,14 +25,12 @@ spec: topologyKey: "kubernetes.io/hostname" containers: - name: llama-container - image: REPO_HERE.azurecr.io/llama-2-13b:TAG_HERE # 
Placeholder that will be replaced - env: - - name: MASTER_ADDR - value: "MASTER_ADDR_HERE" # Placeholder that will be replaced + image: REPO/llama-2-13b:TAG # Placeholder that will be replaced command: - /bin/sh - -c - | + MASTER_ADDR=$(dig +short llama-2-13b.default.svc.cluster.local) echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$') cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference_api.py @@ -70,3 +68,23 @@ spec: operator: Exists nodeSelector: pool: llama213b +--- +apiVersion: v1 +kind: Service +metadata: + name: llama-2-13b +spec: + selector: + app: llama + statefulset.kubernetes.io/pod-name: llama-2-13b-0 + ports: + - name: http + protocol: TCP + port: 80 + targetPort: 5000 + - name: torchrun + protocol: TCP + port: 29500 + targetPort: 29500 + type: ClusterIP + publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml deleted file mode 100644 index 99fc7895d..000000000 --- a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: llama-2-7b-chat -spec: - selector: - app: llama - statefulset.kubernetes.io/pod-name: llama-2-7b-chat-0 - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml index af895fb3c..fb1babc05 100644 --- a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml +++ b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml @@ -15,7 +15,7 @@ spec: spec: containers: - name: llama-container - image: REPO_HERE.azurecr.io/llama-2-7b-chat:TAG_HERE + image: REPO/llama-2-7b-chat:TAG command: - /bin/sh - -c @@ -54,3 +54,18 @@ spec: operator: Exists nodeSelector: pool: llama27bchat +--- +apiVersion: v1 +kind: Service +metadata: + name: llama-2-7b-chat +spec: + selector: + app: llama + statefulset.kubernetes.io/pod-name: llama-2-7b-chat-0 + ports: + - protocol: TCP + port: 80 + targetPort: 5000 + type: ClusterIP + publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml deleted file mode 100644 index d8dfb84c7..000000000 --- a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: llama-2-7b -spec: - selector: - app: llama - statefulset.kubernetes.io/pod-name: llama-2-7b-0 - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml index 265b3c1a1..1179d0347 100644 --- a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml +++ b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml @@ -15,7 +15,7 @@ spec: spec: containers: - name: llama-container - image: REPO_HERE.azurecr.io/llama-2-7b:TAG_HERE + image: REPO/llama-2-7b:TAG command: - /bin/sh - -c @@ -54,3 +54,18 @@ spec: operator: Exists nodeSelector: pool: llama27b +--- +apiVersion: v1 +kind: Service +metadata: + name: 
llama-2-7b +spec: + selector: + app: llama + statefulset.kubernetes.io/pod-name: llama-2-7b-0 + ports: + - protocol: TCP + port: 80 + targetPort: 5000 + type: ClusterIP + publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml deleted file mode 100644 index 94627746d..000000000 --- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: mistral-7b-instruct -spec: - selector: - app: mistral - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml deleted file mode 100644 index 75179683f..000000000 --- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-instruct-container - image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7bins diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml deleted file mode 100644 index 939d6c75b..000000000 --- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-instruct-container - image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: 
NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7bins diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml deleted file mode 100644 index 90ba3ec8f..000000000 --- a/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: mistral-7b -spec: - selector: - app: mistral - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml deleted file mode 100644 index 3eff5594f..000000000 --- a/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-container - image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7b diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml deleted file mode 100644 index 2bd945319..000000000 --- a/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-container - image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7b diff --git a/presets/workspace/test/manifests/phi-2/phi-2-service.yaml b/presets/workspace/test/manifests/phi-2/phi-2-service.yaml deleted file mode 100644 index d0f99f9ad..000000000 --- a/presets/workspace/test/manifests/phi-2/phi-2-service.yaml +++ /dev/null @@ 
-1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-2 -spec: - selector: - app: phi-2 - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml b/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml deleted file mode 100644 index cbc6f94e7..000000000 --- a/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-2 -spec: - replicas: 1 - selector: - matchLabels: - app: phi-2 - template: - metadata: - labels: - app: phi-2 - spec: - containers: - - name: phi-2-container - image: REPO_HERE.azurecr.io/phi-2:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi2 diff --git a/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml b/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml deleted file mode 100644 index e77f21268..000000000 --- a/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-2 -spec: - replicas: 1 - selector: - matchLabels: - app: phi-2 - template: - metadata: - labels: - app: phi-2 - spec: - containers: - - name: phi-2-container - image: REPO_HERE.azurecr.io/phi-2:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi2 diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml deleted file mode 100644 index bab354ee9..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-medium-128k-instruct -spec: - selector: - app: phi-3-medium-128k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml 
b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml deleted file mode 100644 index 0adb122e4..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-128k-instruct - template: - metadata: - labels: - app: phi-3-medium-128k-instruct - spec: - containers: - - name: phi-3-medium-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium12 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml deleted file mode 100644 index 5b93bde50..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-128k-instruct - template: - metadata: - labels: - app: phi-3-medium-128k-instruct - spec: - containers: - - name: phi-3-medium-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --max-model-len 1024 --tensor-parallel-size 2 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium12 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml deleted file mode 100644 index 60710504f..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-medium-4k-instruct -spec: - selector: - app: phi-3-medium-4k-instruct - 
ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml deleted file mode 100644 index 1d0d64e47..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-4k-instruct - template: - metadata: - labels: - app: phi-3-medium-4k-instruct - spec: - containers: - - name: phi-3-medium-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium4k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml deleted file mode 100644 index 3bdce8072..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-4k-instruct - template: - metadata: - labels: - app: phi-3-medium-4k-instruct - spec: - containers: - - name: phi-3-medium-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --tensor-parallel-size 2 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium4k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml deleted file mode 100644 index ef86aefb2..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml +++ /dev/null 
@@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-mini-128k-instruct -spec: - selector: - app: phi-3-mini-128k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml deleted file mode 100644 index cf8898015..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-128k-instruct - template: - metadata: - labels: - app: phi-3-mini-128k-instruct - spec: - containers: - - name: phi-3-mini-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini128k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml deleted file mode 100644 index f719bf96b..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-128k-instruct - template: - metadata: - labels: - app: phi-3-mini-128k-instruct - spec: - containers: - - name: phi-3-mini-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini128k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml deleted file mode 100644 index 
0063f24aa..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-mini-4k-instruct -spec: - selector: - app: phi-3-mini-4k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml deleted file mode 100644 index 1d7069a38..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-4k-instruct - template: - metadata: - labels: - app: phi-3-mini-4k-instruct - spec: - containers: - - name: phi-3-mini-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini4kin \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml deleted file mode 100644 index 8d1275678..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-4k-instruct - template: - metadata: - labels: - app: phi-3-mini-4k-instruct - spec: - containers: - - name: phi-3-mini-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini4kin \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml 
b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml deleted file mode 100644 index a28bac071..000000000 --- a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-small-128k-instruct -spec: - selector: - app: phi-3-small-128k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml deleted file mode 100644 index 1827155f4..000000000 --- a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-small-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-small-128k-instruct - template: - metadata: - labels: - app: phi-3-small-128k-instruct - spec: - containers: - - name: phi-3-small-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-small-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3small128 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml deleted file mode 100644 index 17e031f87..000000000 --- a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-small-8k-instruct -spec: - selector: - app: phi-3-small-8k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml deleted file mode 100644 index 1f515cc6a..000000000 --- a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-small-8k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-small-8k-instruct - template: - metadata: - labels: - app: phi-3-small-8k-instruct - spec: - containers: - - name: phi-3-small-8k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-small-8k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids 
all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3small8ki \ No newline at end of file diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml deleted file mode 100644 index 73637c99a..000000000 --- a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: qwen2-5-coder-7b-instruct -spec: - selector: - app: qwen2-5-coder-7b-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml deleted file mode 100644 index e92d906d7..000000000 --- a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: qwen2-5-coder-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: qwen2-5-coder-7b-instruct - template: - metadata: - labels: - app: qwen2-5-coder-7b-instruct - spec: - containers: - - name: qwen2-5-coder-7b-instruct-container - image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: qwen25coder7 \ No newline at end of file diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml deleted file mode 100644 index 4c1e72510..000000000 --- a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml +++ /dev/null @@ -1,83 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: qwen2-5-coder-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: qwen2-5-coder-7b-instruct - template: - metadata: - labels: - app: 
qwen2-5-coder-7b-instruct - spec: - containers: - - name: qwen2-5-coder-7b-instruct-container - image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - - mountPath: /mnt/config - name: config-volume - volumes: - - name: dshm - emptyDir: - medium: Memory - - configMap: - defaultMode: 420 - name: qwen2-5-coder-7b-inference-params - name: config-volume - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: qwen25coder7 ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: qwen2-5-coder-7b-inference-params -data: - inference_config.yaml: | - # Maximum number of steps to find the max available seq len fitting in the GPU memory. - max_probe_steps: 6 - - vllm: - cpu-offload-gb: 0 - gpu-memory-utilization: 0.95 - swap-space: 4 - served-model-name: test - dtype: float16 - tensor-parallel-size: 2 - - # max-seq-len-to-capture: 8192 - # num-scheduler-steps: 1 - # enable-chunked-prefill: false - # see https://docs.vllm.ai/en/stable/models/engine_args.html for more options. diff --git a/presets/workspace/test/manifests/tuning/falcon/falcon-7b.yaml b/presets/workspace/test/manifests/tuning/falcon/falcon-7b.yaml deleted file mode 100644 index 7852bf01a..000000000 --- a/presets/workspace/test/manifests/tuning/falcon/falcon-7b.yaml +++ /dev/null @@ -1,103 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b-tuning -spec: - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: aimodelsregistrytest.azurecr.io/tuning-falcon-7b:0.0.1 - command: ["/bin/sh", "-c", "sleep infinity"] - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 # Requesting 2 GPUs - volumeMounts: - - name: dshm - mountPath: /dev/shm - - name: workspace - mountPath: /workspace - - - name: docker-sidecar - image: docker:dind - securityContext: - privileged: true # Allows container to manage its own containers - volumeMounts: - - name: workspace - mountPath: /workspace - env: - - name: ACR_USERNAME - value: "{{ACR_USERNAME}}" - - name: ACR_PASSWORD - value: "{{ACR_PASSWORD}}" - - name: TAG - value: "{{TAG}}" - command: ["/bin/sh"] - args: - - -c - - | - # Start the Docker daemon in the background with specific options for DinD - dockerd & - # Wait for the Docker daemon to be ready - while ! docker info > /dev/null 2>&1; do - echo "Waiting for Docker daemon to start..." - sleep 1 - done - echo 'Docker daemon started' - - while true; do - FILE_PATH=$(find /workspace/tfs -name 'fine_tuning_completed.txt') - if [ ! 
-z "$FILE_PATH" ]; then - echo "FOUND TRAINING COMPLETED FILE at $FILE_PATH" - - PARENT_DIR=$(dirname "$FILE_PATH") - echo "Parent directory is $PARENT_DIR" - - TEMP_CONTEXT=$(mktemp -d) - cp "$PARENT_DIR/adapter_config.json" "$TEMP_CONTEXT/adapter_config.json" - cp -r "$PARENT_DIR/adapter_model.safetensors" "$TEMP_CONTEXT/adapter_model.safetensors" - - # Create a minimal Dockerfile - echo 'FROM scratch - ADD adapter_config.json / - ADD adapter_model.safetensors /' > "$TEMP_CONTEXT/Dockerfile" - - # Login to Docker registry - echo $ACR_PASSWORD | docker login $ACR_USERNAME.azurecr.io -u $ACR_USERNAME --password-stdin - - docker build -t $ACR_USERNAME.azurecr.io/adapter-falcon-7b:$TAG "$TEMP_CONTEXT" - docker push $ACR_USERNAME.azurecr.io/adapter-falcon-7b:$TAG - - # Cleanup: Remove the temporary directory - rm -rf "$TEMP_CONTEXT" - - # Remove the file to prevent repeated builds, or handle as needed - # rm "$FILE_PATH" - fi - sleep 10 # Check every 10 seconds - done - - volumes: - - name: dshm - emptyDir: - medium: Memory - - name: workspace - emptyDir: {} - - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists diff --git a/presets/workspace/test/tuning/tuning-job.yaml b/presets/workspace/test/manifests/tuning/tuning.yaml similarity index 100% rename from presets/workspace/test/tuning/tuning-job.yaml rename to presets/workspace/test/manifests/tuning/tuning.yaml diff --git a/presets/workspace/test/scripts/README.md b/presets/workspace/test/scripts/README.md new file mode 100644 index 000000000..aaa49ca69 --- /dev/null +++ b/presets/workspace/test/scripts/README.md @@ -0,0 +1,82 @@ +# Manifest Generation Script + +This script generates Kubernetes manifests for different model deployments based on templates and configurations. + +## Overview + +The script (`generate_manifests.py`) combines: +- Base templates from `presets/workspace/test/manifests/inference-tmpl/manifest.yaml` +- Model configurations from `.github/e2e-preset-configs.json` +- Predefined manifests from `presets/workspace/test/manifests//.yaml` +to generate deployment/statefulset and service manifests for each model. + +## Usage + +```bash +# Print manifests to stdout +python generate_manifests.py [--repo REPO] [--tag TAG] +``` + +### Parameters: +- `model_name`: Name of the model (e.g., "mistral-7b", "falcon-40b") +- `runtime`: Runtime to use ("hf" for Hugging Face or "vllm" for vLLM) +- `--repo`: (Optional) Repository name to replace REPO in the template +- `--tag`: (Optional) Tag to replace TAG in the template + +### Example: +```bash +# Generate manifest for Mistral 7B with vLLM runtime +python generate_manifests.py mistral-7b vllm + +# Generate manifest for Falcon 40B with Hugging Face runtime and custom repo/tag +python generate_manifests.py falcon-40b hf --repo myregistry.azurecr.io --tag v1.0.0 +``` + +## Configuration Files + +### Template (presets/workspace/test/manifests/inference-tmpl/manifest.yaml) +Contains base templates for: +- Deployment +- Service +- ConfigMap + +Placeholders: +- MODEL_NAME +- RUNTIME_COMMAND +- GPU_COUNT +- NODE_POOL +- NODE_COUNT +- REPO (can be overridden with --repo flag) +- TAG (can be overridden with --tag flag) + +### Model Configurations +Located at `.github/e2e-preset-configs.json` +- Contains configurations for all supported models +- Each model configuration includes: + - Basic info (name, node count, VM size, etc.) 
+ - Node pool specification + - Runtime-specific configurations + +## Example Configuration + +```json +{ + "name": "mistral-7b", + "node-count": 1, + "node-vm-size": "Standard_NC6s_v3", + "node-osdisk-size": 100, + "OSS": true, + "loads_adapter": false, + "node_pool": "mistral7b", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja", + "gpu_count": 1 + } + } +} +``` diff --git a/presets/workspace/test/scripts/generate_manifests.py b/presets/workspace/test/scripts/generate_manifests.py new file mode 100644 index 000000000..defc2d80a --- /dev/null +++ b/presets/workspace/test/scripts/generate_manifests.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +import json +import yaml +import sys +import os +import argparse +from pathlib import Path + +def get_project_root(): + # Get the directory containing the current script + current_dir = Path(__file__).resolve().parent + # Go up 4 levels to reach project root (from presets/workspace/test/scripts) + return current_dir.parents[3] + +def load_json_config(): + project_root = get_project_root() + config_path = project_root / '.github' / 'e2e-preset-configs.json' + with open(config_path, 'r') as f: + data = json.load(f) + return data['matrix']['image'] # Return the array of model configs + +def load_template(): + project_root = get_project_root() + template_path = project_root / 'presets' / 'workspace' / 'test' / 'manifests' / 'inference-tmpl' / 'manifest.yaml' + with open(template_path, 'r') as f: + return yaml.safe_load(f) + +def check_predefined_manifest(model_name): + project_root = get_project_root() + manifest_path = project_root / 'presets' / 'workspace' / 'test' / 'manifests' / f'{model_name}' / f'{model_name}.yaml' + if not os.path.exists(manifest_path): + return (None, False) + with open(manifest_path, 'r') as f: + return (f.read(), True) + +def process_model(model_name, runtime, repo=None, tag=None): + configs = load_json_config() + model_config = next((m for m in configs if m['name'] == model_name), None) + + if not model_config: + print(f"Model {model_name} not found in configs", file=sys.stderr) + sys.exit(1) + + predefined_manifest, predefined_manifest_exists = check_predefined_manifest(model_name) + if predefined_manifest_exists: + return process_predefined_manifest(model_name, runtime, predefined_manifest, repo, tag) + + if runtime not in model_config.get('runtimes', {}): + print(f"Runtime {runtime} not configured for model {model_name}", file=sys.stderr) + sys.exit(1) + + runtime_config = model_config['runtimes'][runtime] + workload_name = model_config.get('workload', model_name) + + templates = load_template() + manifest_str = yaml.dump(templates['deployment']) + + # Replace placeholders in template + manifest_str = ( + manifest_str + .replace('WORKLOAD_NAME', workload_name) + .replace('MODEL_NAME', model_name) + .replace('RUNTIME_COMMAND', runtime_config['command']) + .replace('GPU_COUNT', str(runtime_config['gpu_count'])) + .replace('NODE_POOL', model_config['node_pool']) + .replace('NODE_COUNT', str(model_config['node-count'])) + ) + + # Replace repo and tag if provided + if repo: + manifest_str = manifest_str.replace('REPO', repo) + if tag: + manifest_str = manifest_str.replace('TAG', 
tag) + + # Parse the template string back into YAML + manifest = yaml.safe_load(manifest_str) + + # Generate service manifest + service_template = templates['service'] + service_str = yaml.dump(service_template) + service_str = service_str.replace('WORKLOAD_NAME', workload_name) + service_manifest = yaml.safe_load(service_str) + + # Generate config manifest + config_template = templates['config'] + config_str = yaml.dump(config_template) + config_manifest = yaml.safe_load(config_str) + + # Print manifests to stdout + yaml.dump(manifest, sys.stdout, default_flow_style=False) + print('---') # Document separator + yaml.dump(service_manifest, sys.stdout, default_flow_style=False) + print('---') # Document separator + yaml.dump(config_manifest, sys.stdout, default_flow_style=False) + +def process_predefined_manifest(model_name, runtime, predefined_manifest, repo=None, tag=None): + if repo: + predefined_manifest = predefined_manifest.replace('REPO', repo) + if tag: + predefined_manifest = predefined_manifest.replace('TAG', tag) + + print(predefined_manifest) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Process model template with optional repo and tag.') + parser.add_argument('model_name', help='Name of the model') + parser.add_argument('runtime', help='Runtime to use') + parser.add_argument('--repo', help='Repository name to use instead of REPO') + parser.add_argument('--tag', help='Tag to use instead of TAG') + + args = parser.parse_args() + process_model(args.model_name, args.runtime, args.repo, args.tag)
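For orientation, the following is a minimal, hypothetical sketch of the structure that `generate_manifests.py` expects from `inference-tmpl/manifest.yaml`: top-level `deployment`, `service`, and `config` keys plus the plain-string placeholders listed in the README (`WORKLOAD_NAME`, `MODEL_NAME`, `RUNTIME_COMMAND`, `GPU_COUNT`, `NODE_POOL`, `NODE_COUNT`, `REPO`, `TAG`). It is not the template shipped in this patch; every concrete field value below is illustrative only.

```yaml
# Hypothetical sketch of inference-tmpl/manifest.yaml; the real template in this
# patch may differ. Placeholders are plain strings that generate_manifests.py
# substitutes with str.replace() on the dumped YAML before re-parsing it.
deployment:
  apiVersion: apps/v1
  kind: Deployment
  metadata:
    name: WORKLOAD_NAME
  spec:
    replicas: NODE_COUNT
    selector:
      matchLabels:
        app: WORKLOAD_NAME
    template:
      metadata:
        labels:
          app: WORKLOAD_NAME
      spec:
        containers:
          - name: WORKLOAD_NAME-container
            image: REPO.azurecr.io/MODEL_NAME:TAG
            command: ["/bin/sh", "-c", "RUNTIME_COMMAND"]
            resources:
              requests:
                nvidia.com/gpu: GPU_COUNT
              limits:
                nvidia.com/gpu: GPU_COUNT
        nodeSelector:
          pool: NODE_POOL
service:
  apiVersion: v1
  kind: Service
  metadata:
    name: WORKLOAD_NAME
  spec:
    selector:
      app: WORKLOAD_NAME
    ports:
      - protocol: TCP
        port: 80
        targetPort: 5000
    type: ClusterIP
config:
  apiVersion: v1
  kind: ConfigMap
  metadata:
    name: WORKLOAD_NAME-inference-params
  data:
    inference_config.yaml: |
      # illustrative runtime parameters only
      max_probe_steps: 6
```

Because the placeholders are substituted as raw text on the dumped template, the result can be parsed back with `yaml.safe_load` and emitted as three `---`-separated documents, which matches what `process_model` prints to stdout.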