Skip to content

Commit

Permalink
refactor: add inference manifest template and generation script
Browse files Browse the repository at this point in the history
  • Loading branch information
zhuangqh committed Jan 17, 2025
1 parent 3755bab commit cb949fc
Show file tree
Hide file tree
Showing 55 changed files with 564 additions and 1,917 deletions.
180 changes: 154 additions & 26 deletions .github/e2e-preset-configs.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,96 +7,208 @@
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "falcon-7b-adapter",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": true,
"expected_adapter": "amod-mental-health"
"loads_adapter": false,
"node_pool": "falcon7b",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja",
"gpu_count": 1
}
}
},
{
"name": "falcon-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "falcon7binst",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja",
"gpu_count": 1
}
}
},
{
"name": "falcon-40b",
"node-count": 1,
"node-vm-size": "Standard_NC48ads_A100_v4",
"node-osdisk-size": 400,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "falcon40b",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
"gpu_count": 2
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2",
"gpu_count": 2
}
}
},
{
"name": "falcon-40b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC48ads_A100_v4",
"node-osdisk-size": 400,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "falcon40bins",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
"gpu_count": 2
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2",
"gpu_count": 2
}
}
},
{
"name": "mistral-7b",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "mistral7b",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja",
"gpu_count": 1
}
}
},
{
"name": "mistral-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "mistral7bins",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype float16",
"gpu_count": 1
}
}
},
{
"name": "phi-2",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 50,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "phi2",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype float16",
"gpu_count": 1
}
}
},
{
"name": "phi-3-mini-4k-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 50,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "phi3mini4kin",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype float16",
"gpu_count": 1
}
}
},
{
"name": "phi-3-mini-128k-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 50,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "phi3mini128k",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype float16",
"gpu_count": 1
}
}
},
{
"name": "phi-3-medium-4k-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "phi3medium4k",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype float16 --tensor-parallel-size 2",
"gpu_count": 2
}
}
},
{
"name": "phi-3-medium-128k-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "phi3medium12",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --dtype float16 --max-model-len 1024 --tensor-parallel-size 2",
"gpu_count": 2
}
}
},
{
"name": "qwen2.5-coder-7b-instruct",
Expand All @@ -105,47 +217,63 @@
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "qwen25coder7",
"runtimes": {
"hf": {
"command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code",
"gpu_count": 1
},
"vllm": {
"command": "python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml --tensor-parallel-size 2",
"gpu_count": 2
}
}
},
{
"name": "llama-2-7b",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100,
"OSS": false,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "llama27b"
},
{
"name": "llama-2-7b-chat",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100,
"OSS": false,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "llama27bchat"
},
{
"name": "llama-2-13b",
"node-count": 2,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 150,
"OSS": false,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "llama213b"
},
{
"name": "llama-2-13b-chat",
"node-count": 2,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 150,
"OSS": false,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "llama213bcha"
},
{
"name": "tuning",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
"loads_adapter": false,
"node_pool": "tuning"
}
]
}
Expand Down
Loading

0 comments on commit cb949fc

Please sign in to comment.