refactor: add inference manifest template and generation script

kaito-project · Jan 17, 2025 · cb949fc · cb949fc
1 parent 3755bab
commit cb949fc
Show file tree

Hide file tree

Showing 55 changed files with 564 additions and 1,917 deletions.
diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
@@ -7,96 +7,208 @@
         "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
-        "loads_adapter": false
-      },
-      {
-        "name": "falcon-7b-adapter",
-        "node-count": 1,
-        "node-vm-size": "Standard_NC12s_v3",
-        "node-osdisk-size": 100,
-        "OSS": true,
-        "loads_adapter": true,
-        "expected_adapter": "amod-mental-health"
+        "loads_adapter": false,
+        "node_pool": "falcon7b",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja",
+            "gpu_count": 1
+          }
+        }
       },
       {
         "name": "falcon-7b-instruct",
         "node-count": 1,
         "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "falcon7binst",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja",
+            "gpu_count": 1
+          }
+        }
       },
       {
         "name": "falcon-40b",
         "node-count": 1,
         "node-vm-size": "Standard_NC48ads_A100_v4",
         "node-osdisk-size": 400,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "falcon40b",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
+            "gpu_count": 2
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2",
+            "gpu_count": 2
+          }
+        }
       },
       {
         "name": "falcon-40b-instruct",
         "node-count": 1,
         "node-vm-size": "Standard_NC48ads_A100_v4",
         "node-osdisk-size": 400,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "falcon40bins",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
+            "gpu_count": 2
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2",
+            "gpu_count": 2
+          }
+        }
       },
       {
         "name": "mistral-7b",
         "node-count": 1,
         "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "mistral7b",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja",
+            "gpu_count": 1
+          }
+        }
       },
       {
         "name": "mistral-7b-instruct",
         "node-count": 1,
         "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "mistral7bins",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype float16",
+            "gpu_count": 1
+          }
+        }
       },
       {
         "name": "phi-2",
         "node-count": 1,
         "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 50,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "phi2",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype float16",
+            "gpu_count": 1
+          }
+        }
       },
       {
         "name": "phi-3-mini-4k-instruct",
         "node-count": 1,
         "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 50,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "phi3mini4kin",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype float16",
+            "gpu_count": 1
+          }
+        }
       },
       {
         "name": "phi-3-mini-128k-instruct",
         "node-count": 1,
         "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 50,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "phi3mini128k",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype float16",
+            "gpu_count": 1
+          }
+        }
       },
       {
         "name": "phi-3-medium-4k-instruct",
         "node-count": 1,
         "node-vm-size": "Standard_NC12s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "phi3medium4k",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --tensor-parallel-size 2",
+            "gpu_count": 2
+          }
+        }
       },
       {
         "name": "phi-3-medium-128k-instruct",
         "node-count": 1,
         "node-vm-size": "Standard_NC12s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "phi3medium12",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --max-model-len 1024 --tensor-parallel-size 2",
+            "gpu_count": 2
+          }
+        }
       },
       {
         "name": "qwen2.5-coder-7b-instruct",
@@ -105,47 +217,63 @@
         "node-vm-size": "Standard_NC12s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "qwen25coder7",
+        "runtimes": {
+          "hf": {
+            "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
+            "gpu_count": 1
+          },
+          "vllm": {
+            "command": "python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml --tensor-parallel-size 2",
+            "gpu_count": 2
+          }
+        }
       },
       {
         "name": "llama-2-7b",
         "node-count": 1,
         "node-vm-size": "Standard_NC12s_v3",
         "node-osdisk-size": 100,
         "OSS": false,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "llama27b"
       },
       {
         "name": "llama-2-7b-chat",
         "node-count": 1,
         "node-vm-size": "Standard_NC12s_v3",
         "node-osdisk-size": 100,
         "OSS": false,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "llama27bchat"
       },
       {
         "name": "llama-2-13b",
         "node-count": 2,
         "node-vm-size": "Standard_NC12s_v3",
         "node-osdisk-size": 150,
         "OSS": false,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "llama213b"
       },
       {
         "name": "llama-2-13b-chat",
         "node-count": 2,
         "node-vm-size": "Standard_NC12s_v3",
         "node-osdisk-size": 150,
         "OSS": false,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "llama213bchat"
       },
       {
         "name": "tuning",
         "node-count": 1,
         "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
-        "loads_adapter": false
+        "loads_adapter": false,
+        "node_pool": "tuning"
       }
     ]
   }