chore: Commenting and Renaming #194

Merged · 14 commits · Jan 4, 2024
1 change: 1 addition & 0 deletions .github/workflows/kind-cluster/docker-job-template.yaml
@@ -3,6 +3,7 @@ kind: Job
 metadata:
   name: docker-build-job-{{JOB_ID}}
 spec:
+  ttlSecondsAfterFinished: 600  # Job and its pods are deleted 10 min after job completion
   backoffLimit: 3  # Number of retries before marking the job as failed
   template:
     spec:
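The two commented fields above are standard batch/v1 Job controls: `ttlSecondsAfterFinished` lets the TTL controller garbage-collect the finished Job and its pods, and `backoffLimit` caps pod retries before the Job is marked failed. As a hedged illustration (the function name and wiring below are invented, not part of this PR), the same fields could be set programmatically with client-go types:

```go
package main

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// buildDockerBuildJob sketches the fields set in docker-job-template.yaml:
// the finished Job is garbage-collected after 10 minutes, and failed pods
// are retried up to 3 times before the Job itself is marked failed.
func buildDockerBuildJob(jobID string) *batchv1.Job {
	ttl := int32(600)
	retries := int32(3)
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{Name: "docker-build-job-" + jobID},
		Spec: batchv1.JobSpec{
			TTLSecondsAfterFinished: &ttl,     // delete Job and pods 10 min after completion
			BackoffLimit:            &retries, // retry failed pods up to 3 times
			Template:                corev1.PodTemplateSpec{ /* pod spec elided */ },
		},
	}
}
```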
16 changes: 9 additions & 7 deletions .github/workflows/kind-cluster/main.py
@@ -1,10 +1,11 @@
-import subprocess
 import os
-import shutil
-from pathlib import Path
-import time
 import random
+import shutil
 import string
+import subprocess
+import time
+from pathlib import Path
 
 import yaml
 
 KAITO_REPO_URL = "https://github.com/Azure/kaito.git"
@@ -85,7 +86,8 @@ def write_job_file(job_yaml, job_name):
 def populate_job_template(model, img_tag, job_name, env_vars):
     """Populate the job template with provided values."""
     try:
-        with open("/home/azureuser/docker-job-template.yaml", "r") as file:
+        docker_job_template = Path.cwd() / "repo/.github/workflows/kind-cluster/docker-job-template.yaml"
+        with open(docker_job_template, "r") as file:
             job_template = file.read()
 
         replacements = {
@@ -145,10 +147,10 @@ def check_job_status(job_name):
return "succeeded"
elif failed and int(failed) > 0:
return "failed"
else:
else:
return "running"

def wait_for_jobs_to_complete(job_names, timeout=3600):
def wait_for_jobs_to_complete(job_names, timeout=10800):
"""Wait for all jobs to complete with a timeout."""
start_time = time.time()
while time.time() - start_time < timeout:
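`check_job_status` classifies a Job from its status counters, and `wait_for_jobs_to_complete` now polls for up to 3 hours (`timeout=10800`) instead of 1. The succeeded/failed/running decision maps directly onto the Kubernetes Job status API; below is a minimal client-go sketch of the same classification (illustrative only — this PR keeps the logic in Python):

```go
package main

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// checkJobStatus mirrors check_job_status in main.py: a Job is "succeeded"
// once status.succeeded > 0, "failed" once status.failed > 0 (after the
// backoffLimit retries are exhausted), and "running" otherwise.
func checkJobStatus(ctx context.Context, cs kubernetes.Interface, ns, name string) (string, error) {
	job, err := cs.BatchV1().Jobs(ns).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		return "", err
	}
	switch {
	case job.Status.Succeeded > 0:
		return "succeeded", nil
	case job.Status.Failed > 0:
		return "failed", nil
	default:
		return "running", nil
	}
}
```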
4 changes: 2 additions & 2 deletions pkg/controllers/workspace_controller.go
@@ -420,8 +420,8 @@ func (c *WorkspaceReconciler) ensureService(ctx context.Context, wObj *kaitov1al
 }
 
 func (c *WorkspaceReconciler) updateInferenceParamFromWorkspace(ctx context.Context, wObj *kaitov1alpha1.Workspace, inferenceParam *model.PresetInferenceParam) {
-	inferenceParam.AccessMode = string(wObj.Inference.Preset.PresetMeta.AccessMode)
-	if inferenceParam.AccessMode == "private" && wObj.Inference.Preset.PresetOptions.Image != "" {
+	inferenceParam.ImageAccessMode = string(wObj.Inference.Preset.PresetMeta.AccessMode)
+	if inferenceParam.ImageAccessMode == "private" && wObj.Inference.Preset.PresetOptions.Image != "" {
 		inferenceParam.Image = wObj.Inference.Preset.PresetOptions.Image
 
 		imagePullSecretRefs := []corev1.LocalObjectReference{}
67 changes: 22 additions & 45 deletions pkg/inference/preset-inference-types.go
@@ -59,15 +59,12 @@ var (
 	presetFalcon40bInstructImage = registryName + fmt.Sprintf("/kaito-%s:0.0.1", PresetFalcon40BInstructModel)
 
 	baseCommandPresetLlama = "cd /workspace/llama/llama-2 && torchrun"
-	// llamaTextInferenceFile = "inference-api.py" TODO: To support Text Generation Llama Models
-	llamaChatInferenceFile = "inference-api.py"
 	llamaRunParams = map[string]string{
 		"max_seq_len":    "512",
 		"max_batch_size": "8",
 	}
 
 	baseCommandPresetFalcon = "accelerate launch --use_deepspeed"
-	falconInferenceFile     = "inference-api.py"
 	falconRunParams         = map[string]string{}
 
 	defaultTorchRunParams = map[string]string{
@@ -93,7 +90,7 @@ var (
"gpu_ids": DefaultGPUIds,
}

defaultAccessMode = "public"
defaultImageAccessMode = "public"
defaultImagePullSecrets = []corev1.LocalObjectReference{}
)

@@ -122,7 +119,7 @@ var (
"gpu_ids": DefaultGPUIds,
}

DefaultAccessMode = "public"
DefaultImageAccessMode = "public"
DefaultImagePullSecrets = []corev1.LocalObjectReference{}
)

@@ -132,172 +129,152 @@ var (
 	Llama2PresetInferences = map[string]model.PresetInferenceParam{
 
 		PresetLlama2AChat: {
-			ModelName:                 "LLaMa2",
+			ModelFamilyName:           "LLaMa2",
 			Image:                     "",
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "34Gi",
 			GPUCountRequirement:       "1",
 			TotalGPUMemoryRequirement: "16Gi",
 			TorchRunParams:            defaultTorchRunParams,
 			TorchRunRdzvParams:        defaultTorchRunRdzvParams,
 			ModelRunParams:            llamaRunParams,
-			InferenceFile:             llamaChatInferenceFile,
 			DeploymentTimeout:         time.Duration(10) * time.Minute,
 			BaseCommand:               baseCommandPresetLlama,
 			WorldSize:                 1,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 		PresetLlama2AModel: {
-			ModelName:                 "LLaMa2",
+			ModelFamilyName:           "LLaMa2",
 			Image:                     "",
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "34Gi",
 			GPUCountRequirement:       "1",
 			TotalGPUMemoryRequirement: "16Gi",
 			TorchRunParams:            defaultTorchRunParams,
 			TorchRunRdzvParams:        defaultTorchRunRdzvParams,
 			ModelRunParams:            llamaRunParams,
-			InferenceFile:             llamaChatInferenceFile,
 			DeploymentTimeout:         time.Duration(10) * time.Minute,
 			BaseCommand:               baseCommandPresetLlama,
 			WorldSize:                 1,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 		PresetLlama2BChat: {
-			ModelName:                 "LLaMa2",
+			ModelFamilyName:           "LLaMa2",
 			Image:                     "",
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "46Gi",
 			GPUCountRequirement:       "2",
 			TotalGPUMemoryRequirement: "16Gi",
 			TorchRunParams:            defaultTorchRunParams,
 			TorchRunRdzvParams:        defaultTorchRunRdzvParams,
 			ModelRunParams:            llamaRunParams,
-			InferenceFile:             llamaChatInferenceFile,
 			DeploymentTimeout:         time.Duration(20) * time.Minute,
 			BaseCommand:               baseCommandPresetLlama,
 			WorldSize:                 2,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 		PresetLlama2BModel: {
-			ModelName:                 "LLaMa2",
+			ModelFamilyName:           "LLaMa2",
 			Image:                     "",
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "46Gi",
 			GPUCountRequirement:       "2",
 			TotalGPUMemoryRequirement: "16Gi",
 			TorchRunParams:            defaultTorchRunParams,
 			TorchRunRdzvParams:        defaultTorchRunRdzvParams,
 			ModelRunParams:            llamaRunParams,
-			InferenceFile:             llamaChatInferenceFile,
 			DeploymentTimeout:         time.Duration(20) * time.Minute,
 			BaseCommand:               baseCommandPresetLlama,
 			WorldSize:                 2,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 		PresetLlama2CChat: {
-			ModelName:                 "LLaMa2",
+			ModelFamilyName:           "LLaMa2",
 			Image:                     "",
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "158Gi",
 			GPUCountRequirement:       "8",
 			TotalGPUMemoryRequirement: "19Gi",
 			TorchRunParams:            defaultTorchRunParams,
 			TorchRunRdzvParams:        defaultTorchRunRdzvParams,
 			ModelRunParams:            llamaRunParams,
-			InferenceFile:             llamaChatInferenceFile,
 			DeploymentTimeout:         time.Duration(30) * time.Minute,
 			BaseCommand:               baseCommandPresetLlama,
 			WorldSize:                 8,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 		PresetLlama2CModel: {
-			ModelName:                 "LLaMa2",
+			ModelFamilyName:           "LLaMa2",
 			Image:                     "",
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "158Gi",
 			GPUCountRequirement:       "8",
 			TotalGPUMemoryRequirement: "19Gi",
 			TorchRunParams:            defaultTorchRunParams,
 			TorchRunRdzvParams:        defaultTorchRunRdzvParams,
 			ModelRunParams:            llamaRunParams,
-			InferenceFile:             llamaChatInferenceFile,
 			DeploymentTimeout:         time.Duration(30) * time.Minute,
 			BaseCommand:               baseCommandPresetLlama,
 			WorldSize:                 8,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 	}

 	// FalconPresetInferences defines the preset inferences for Falcon.
 	FalconPresetInferences = map[string]model.PresetInferenceParam{
 		PresetFalcon7BModel: {
-			ModelName:                 "Falcon",
+			ModelFamilyName:           "Falcon",
 			Image:                     presetFalcon7bImage,
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "50Gi",
 			GPUCountRequirement:       "1",
 			TotalGPUMemoryRequirement: "14Gi",
 			TorchRunParams:            defaultAccelerateParams,
 			ModelRunParams:            falconRunParams,
-			InferenceFile:             falconInferenceFile,
 			DeploymentTimeout:         time.Duration(30) * time.Minute,
 			BaseCommand:               baseCommandPresetFalcon,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 		PresetFalcon7BInstructModel: {
-			ModelName:                 "Falcon",
+			ModelFamilyName:           "Falcon",
 			Image:                     presetFalcon7bInstructImage,
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "50Gi",
 			GPUCountRequirement:       "1",
 			TotalGPUMemoryRequirement: "14Gi",
 			TorchRunParams:            defaultAccelerateParams,
 			ModelRunParams:            falconRunParams,
-			InferenceFile:             falconInferenceFile,
 			DeploymentTimeout:         time.Duration(30) * time.Minute,
 			BaseCommand:               baseCommandPresetFalcon,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 
 		PresetFalcon40BModel: {
-			ModelName:                 "Falcon",
+			ModelFamilyName:           "Falcon",
 			Image:                     presetFalcon40bImage,
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "400",
 			GPUCountRequirement:       "2",
 			TotalGPUMemoryRequirement: "90Gi",
 			TorchRunParams:            defaultAccelerateParams,
 			ModelRunParams:            falconRunParams,
-			InferenceFile:             falconInferenceFile,
 			DeploymentTimeout:         time.Duration(30) * time.Minute,
 			BaseCommand:               baseCommandPresetFalcon,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 
 		PresetFalcon40BInstructModel: {
-			ModelName:                 "Falcon",
+			ModelFamilyName:           "Falcon",
 			Image:                     presetFalcon40bInstructImage,
 			ImagePullSecrets:          defaultImagePullSecrets,
-			AccessMode:                defaultAccessMode,
+			ImageAccessMode:           defaultImageAccessMode,
 			DiskStorageRequirement:    "400",
 			GPUCountRequirement:       "2",
 			TotalGPUMemoryRequirement: "90Gi",
 			TorchRunParams:            defaultAccelerateParams,
 			ModelRunParams:            falconRunParams,
-			InferenceFile:             falconInferenceFile,
 			DeploymentTimeout:         time.Duration(30) * time.Minute,
 			BaseCommand:               baseCommandPresetFalcon,
-			DefaultVolumeMountPath:    "/dev/shm",
 		},
 	}
 )
10 changes: 8 additions & 2 deletions pkg/inference/preset-inferences.go
@@ -20,6 +20,8 @@ import (
 const (
 	ProbePath = "/healthz"
 	Port5000  = int32(5000)
+	InferenceFile          = "inference-api.py"
+	DefaultVolumeMountPath = "/dev/shm"
 )
 
 var (
@@ -116,10 +118,14 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work
 	return depObj, nil
 }
 
+// prepareInferenceParameters builds a PyTorch command:
+// torchrun <TORCH_PARAMS> <OPTIONAL_RDZV_PARAMS> baseCommand <MODEL_PARAMS>
+// and sets the GPU resources required for inference.
+// Returns the command and resource configuration.
 func prepareInferenceParameters(ctx context.Context, inferenceObj *model.PresetInferenceParam) ([]string, corev1.ResourceRequirements) {
 	torchCommand := buildCommandStr(inferenceObj.BaseCommand, inferenceObj.TorchRunParams)
 	torchCommand = buildCommandStr(torchCommand, inferenceObj.TorchRunRdzvParams)
-	modelCommand := buildCommandStr(inferenceObj.InferenceFile, inferenceObj.ModelRunParams)
+	modelCommand := buildCommandStr(InferenceFile, inferenceObj.ModelRunParams)
 	commands := shellCommand(torchCommand + " " + modelCommand)
 
 	resourceRequirements := corev1.ResourceRequirements{
@@ -152,7 +158,7 @@ func configVolume(wObj *kaitov1alpha1.Workspace, inferenceObj *model.PresetInfer

 		volumeMount = append(volumeMount, corev1.VolumeMount{
 			Name:      volume[0].Name,
-			MountPath: inferenceObj.DefaultVolumeMountPath,
+			MountPath: DefaultVolumeMountPath,
 		})
 	}
 
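The new doc comment spells out the command shape: torchrun flags first, then optional rendezvous flags, then the inference script with its model flags. `buildCommandStr` itself is not shown in this diff; below is a minimal sketch, assuming it simply appends each parameter as a `--key=value` flag to the base string:

```go
package main

import (
	"fmt"
	"sort"
	"strings"
)

// buildCommandStr is a sketch of the helper referenced above: it appends
// each parameter in params to the base command as a "--key=value" flag.
func buildCommandStr(base string, params map[string]string) string {
	keys := make([]string, 0, len(params))
	for k := range params {
		keys = append(keys, k)
	}
	sort.Strings(keys) // deterministic flag order for readability
	var sb strings.Builder
	sb.WriteString(base)
	for _, k := range keys {
		fmt.Fprintf(&sb, " --%s=%s", k, params[k])
	}
	return sb.String()
}

func main() {
	torch := buildCommandStr("torchrun", map[string]string{"nproc_per_node": "1", "nnodes": "1"})
	model := buildCommandStr("inference-api.py", map[string]string{"max_seq_len": "512", "max_batch_size": "8"})
	fmt.Println(torch + " " + model)
	// torchrun --nnodes=1 --nproc_per_node=1 inference-api.py --max_batch_size=8 --max_seq_len=512
}
```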
33 changes: 16 additions & 17 deletions pkg/model/interface.go
@@ -13,26 +13,25 @@ type Model interface {
 	SupportDistributedInference() bool // If true, the model workload will be a StatefulSet, using the torch elastic runtime framework.
 }
 
-// PresetInferenceParam defines the preset inference.
+// PresetInferenceParam defines the preset inference parameters for a model.
 type PresetInferenceParam struct {
-	ModelName                 string
-	Image                     string
-	ImagePullSecrets          []corev1.LocalObjectReference
-	AccessMode                string
-	DiskStorageRequirement    string
-	GPUCountRequirement       string
-	TotalGPUMemoryRequirement string
-	PerGPUMemoryRequirement   string
-	TorchRunParams            map[string]string
-	TorchRunRdzvParams        map[string]string
-	ModelRunParams            map[string]string
-	InferenceFile             string
+	ModelFamilyName           string                        // The name of the model family.
+	Image                     string                        // Docker image used for running the inference.
+	ImagePullSecrets          []corev1.LocalObjectReference // Secrets for pulling the image from a private registry.
+	ImageAccessMode           string                        // Defines whether the Image is Public or Private.
+	DiskStorageRequirement    string                        // Disk storage requirements for the model.
+	GPUCountRequirement       string                        // Number of GPUs required for the inference.
+	TotalGPUMemoryRequirement string                        // Total GPU memory required for the inference.
+	PerGPUMemoryRequirement   string                        // GPU memory required per GPU.
+	TorchRunParams            map[string]string             // Parameters for configuring the torchrun command.
+	TorchRunRdzvParams        map[string]string             // Optional rendezvous parameters for distributed inference using torchrun (elastic).
+	ModelRunParams            map[string]string             // Parameters for running the model inference.
 	// DeploymentTimeout defines the maximum duration for pulling the Preset image.
-	// This timeout accommodates the size of PresetX, ensuring pull completion
+	// This timeout accommodates the size of the image, ensuring pull completion
 	// even under slower network conditions or unforeseen delays.
 	DeploymentTimeout time.Duration
-	BaseCommand string
-	// WorldSize defines num of processes required for inference
+	// BaseCommand is the initial command (e.g., 'torchrun', 'accelerate launch') used in the command line.
+	BaseCommand string
+	// WorldSize defines the number of processes required for distributed inference.
 	WorldSize int
-	DefaultVolumeMountPath string
 }
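For concreteness, here is a hedged sketch of what a preset entry built from this struct looks like after the renames. The values, registry/tag, and module import path are invented for illustration; the real entries live in pkg/inference/preset-inference-types.go.

```go
package inference

import (
	"time"

	corev1 "k8s.io/api/core/v1"

	"github.com/Azure/kaito/pkg/model" // assumed module path
)

// exampleFalconPreset is a hypothetical preset entry showing how the
// renamed and newly commented fields fit together; values are illustrative.
var exampleFalconPreset = model.PresetInferenceParam{
	ModelFamilyName:           "Falcon",
	Image:                     "example.azurecr.io/kaito-falcon-7b:0.0.1", // assumed registry/tag
	ImagePullSecrets:          []corev1.LocalObjectReference{},
	ImageAccessMode:           "public",
	DiskStorageRequirement:    "50Gi",
	GPUCountRequirement:       "1",
	TotalGPUMemoryRequirement: "14Gi",
	TorchRunParams:            map[string]string{"num_processes": "1"},
	ModelRunParams:            map[string]string{},
	DeploymentTimeout:         30 * time.Minute,
	BaseCommand:               "accelerate launch --use_deepspeed",
}
```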