diff --git a/charts/kaito/workspace/templates/clusterrole.yaml b/charts/kaito/workspace/templates/clusterrole.yaml index 137144614..f8d6a62ee 100644 --- a/charts/kaito/workspace/templates/clusterrole.yaml +++ b/charts/kaito/workspace/templates/clusterrole.yaml @@ -39,6 +39,9 @@ rules: - apiGroups: ["admissionregistration.k8s.io"] resources: ["validatingwebhookconfigurations"] verbs: ["get","list","watch"] + - apiGroups: [ "batch" ] + resources: [ "jobs" ] + verbs: [ "get", "list", "watch", "create", "delete","update", "patch" ] - apiGroups: ["admissionregistration.k8s.io"] resources: ["validatingwebhookconfigurations"] verbs: ["update"] diff --git a/pkg/resources/manifests.go b/pkg/resources/manifests.go index 68c59fc25..48ecb854f 100644 --- a/pkg/resources/manifests.go +++ b/pkg/resources/manifests.go @@ -5,6 +5,7 @@ package resources import ( "context" "fmt" + batchv1 "k8s.io/api/batch/v1" "k8s.io/utils/pointer" @@ -187,15 +188,30 @@ func GenerateStatefulSetManifest(ctx context.Context, workspaceObj *kaitov1alpha func GenerateTuningJobManifest(ctx context.Context, wObj *kaitov1alpha1.Workspace, imageName string, imagePullSecretRefs []corev1.LocalObjectReference, replicas int, commands []string, containerPorts []corev1.ContainerPort, livenessProbe, readinessProbe *corev1.Probe, resourceRequirements corev1.ResourceRequirements, tolerations []corev1.Toleration, - initContainers []corev1.Container, volumes []corev1.Volume, volumeMounts []corev1.VolumeMount) *batchv1.Job { + initContainers []corev1.Container, sidecarContainers []corev1.Container, volumes []corev1.Volume, volumeMounts []corev1.VolumeMount) *batchv1.Job { labels := map[string]string{ kaitov1alpha1.LabelWorkspaceName: wObj.Name, } - //TODO: - // Will be included in future PR, this code includes - // bash script for pushing results based on user - // data destination method - //pushMethod, pushArg := determinePushMethod(wObj) + + // Add volume mounts to sidecar containers + for i := range sidecarContainers { + sidecarContainers[i].VolumeMounts = append(sidecarContainers[i].VolumeMounts, volumeMounts...) + } + + // Construct the complete list of containers (main and sidecars) + containers := append([]corev1.Container{ + { + Name: wObj.Name, + Image: imageName, + Command: commands, + Resources: resourceRequirements, + LivenessProbe: livenessProbe, + ReadinessProbe: readinessProbe, + Ports: containerPorts, + VolumeMounts: volumeMounts, + }, + }, sidecarContainers...) + return &batchv1.Job{ TypeMeta: v1.TypeMeta{ APIVersion: "batch/v1", @@ -221,29 +237,8 @@ func GenerateTuningJobManifest(ctx context.Context, wObj *kaitov1alpha1.Workspac Labels: labels, }, Spec: corev1.PodSpec{ - InitContainers: initContainers, - Containers: []corev1.Container{ - { - Name: wObj.Name, - Image: imageName, - Command: commands, - Resources: resourceRequirements, - LivenessProbe: livenessProbe, - ReadinessProbe: readinessProbe, - Ports: containerPorts, - VolumeMounts: volumeMounts, - }, - { - Name: "docker-sidecar", - Image: "docker:dind", - SecurityContext: &corev1.SecurityContext{ - Privileged: pointer.BoolPtr(true), - }, - VolumeMounts: volumeMounts, - Command: []string{"/bin/sh", "-c"}, - // TODO: Args: []string{pushMethod(pushArg)}, - }, - }, + InitContainers: initContainers, + Containers: containers, RestartPolicy: corev1.RestartPolicyNever, Volumes: volumes, Tolerations: tolerations, @@ -390,5 +385,4 @@ func GenerateDeploymentManifestWithPodTemplate(ctx context.Context, workspaceObj Template: *templateCopy, }, } - } diff --git a/pkg/tuning/preset-tuning.go b/pkg/tuning/preset-tuning.go index 0f201ea39..7b6744961 100644 --- a/pkg/tuning/preset-tuning.go +++ b/pkg/tuning/preset-tuning.go @@ -3,6 +3,7 @@ package tuning import ( "context" "fmt" + "k8s.io/utils/pointer" "os" "strings" @@ -67,11 +68,6 @@ func GetDataSrcImageInfo(ctx context.Context, wObj *kaitov1alpha1.Workspace) (st return wObj.Tuning.Input.Image, imagePullSecretRefs } -func GetDataDestImageInfo(ctx context.Context, wObj *kaitov1alpha1.Workspace) (string, []corev1.LocalObjectReference) { - imagePushSecretRefs := []corev1.LocalObjectReference{{Name: wObj.Tuning.Output.ImagePushSecret}} - return wObj.Tuning.Output.Image, imagePushSecretRefs -} - func EnsureTuningConfigMap(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, tuningObj *model.PresetParam, kubeClient client.Client) error { // Copy Configmap from helm chart configmap into workspace @@ -109,18 +105,55 @@ func EnsureTuningConfigMap(ctx context.Context, workspaceObj *kaitov1alpha1.Work return nil } -func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, - tuningObj *model.PresetParam, kubeClient client.Client) (client.Object, error) { - initContainers, imagePullSecrets, volumes, volumeMounts, err := prepareDataSource(ctx, workspaceObj, kubeClient) - if err != nil { - return nil, err - } +func dockerSidecarScriptPushImage(image string) string { + // TODO: Override output path if specified in trainingconfig (instead of /mnt/results) + return fmt.Sprintf(` +# Start the Docker daemon in the background with specific options for DinD +dockerd & +# Wait for the Docker daemon to be ready +while ! docker info > /dev/null 2>&1; do + echo "Waiting for Docker daemon to start..." + sleep 1 +done +echo 'Docker daemon started' - err = EnsureTuningConfigMap(ctx, workspaceObj, tuningObj, kubeClient) - if err != nil { - return nil, err - } +while true; do + FILE_PATH=$(find /mnt/results -name 'fine_tuning_completed.txt') + if [ ! -z "$FILE_PATH" ]; then + echo "FOUND TRAINING COMPLETED FILE at $FILE_PATH" + + PARENT_DIR=$(dirname "$FILE_PATH") + echo "Parent directory is $PARENT_DIR" + + TEMP_CONTEXT=$(mktemp -d) + cp "$PARENT_DIR/adapter_config.json" "$TEMP_CONTEXT/adapter_config.json" + cp -r "$PARENT_DIR/adapter_model.safetensors" "$TEMP_CONTEXT/adapter_model.safetensors" + + # Create a minimal Dockerfile + echo 'FROM scratch + ADD adapter_config.json / + ADD adapter_model.safetensors /' > "$TEMP_CONTEXT/Dockerfile" + + docker build -t %s "$TEMP_CONTEXT" + docker push %s + # Cleanup: Remove the temporary directory + rm -rf "$TEMP_CONTEXT" + + # Remove the file to prevent repeated builds + rm "$FILE_PATH" + echo "Upload complete" + exit 0 + fi + sleep 10 # Check every 10 seconds +done`, image, image) +} + +func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace) ([]corev1.Volume, []corev1.VolumeMount) { + var volumes []corev1.Volume + var volumeMounts []corev1.VolumeMount + + // Add shared volume for shared memory (multi-node) shmVolume, shmVolumeMount := utils.ConfigSHMVolume(*workspaceObj.Resource.Count) if shmVolume.Name != "" { volumes = append(volumes, shmVolume) @@ -129,10 +162,55 @@ func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspa volumeMounts = append(volumeMounts, shmVolumeMount) } + // Add shared volume for tuning parameters cmVolume, cmVolumeMount := utils.ConfigCMVolume(workspaceObj.Tuning.ConfigTemplate) volumes = append(volumes, cmVolume) volumeMounts = append(volumeMounts, cmVolumeMount) + // Add shared volume for results dir + resultsVolume, resultsVolumeMount := utils.ConfigResultsVolume() + if resultsVolume.Name != "" { + volumes = append(volumes, resultsVolume) + } + if resultsVolumeMount.Name != "" { + volumeMounts = append(volumeMounts, resultsVolumeMount) + } + return volumes, volumeMounts +} + +func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, + tuningObj *model.PresetParam, kubeClient client.Client) (client.Object, error) { + var initContainers, sidecarContainers []corev1.Container + volumes, volumeMounts := setupDefaultSharedVolumes(workspaceObj) + + initContainer, imagePullSecrets, dataSourceVolume, dataSourceVolumeMount, err := prepareDataSource(ctx, workspaceObj) + if err != nil { + return nil, err + } + volumes = append(volumes, dataSourceVolume) + volumeMounts = append(volumeMounts, dataSourceVolumeMount) + if initContainer.Name != "" { + initContainers = append(initContainers, *initContainer) + } + + sidecarContainer, imagePushSecret, dataDestVolume, dataDestVolumeMount, err := prepareDataDestination(ctx, workspaceObj) + if err != nil { + return nil, err + } + volumes = append(volumes, dataDestVolume) + volumeMounts = append(volumeMounts, dataDestVolumeMount) + if sidecarContainer != nil { + sidecarContainers = append(sidecarContainers, *sidecarContainer) + } + if imagePushSecret != nil { + imagePullSecrets = append(imagePullSecrets, *imagePushSecret) + } + + err = EnsureTuningConfigMap(ctx, workspaceObj, tuningObj, kubeClient) + if err != nil { + return nil, err + } + modelCommand, err := prepareModelRunParameters(ctx, tuningObj) if err != nil { return nil, err @@ -141,7 +219,7 @@ func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspa tuningImage := GetTuningImageInfo(ctx, workspaceObj, tuningObj) jobObj := resources.GenerateTuningJobManifest(ctx, workspaceObj, tuningImage, imagePullSecrets, *workspaceObj.Resource.Count, commands, - containerPorts, nil, nil, resourceReq, tolerations, initContainers, volumes, volumeMounts) + containerPorts, nil, nil, resourceReq, tolerations, initContainers, sidecarContainers, volumes, volumeMounts) err = resources.CreateResource(ctx, jobObj, kubeClient) if client.IgnoreAlreadyExists(err) != nil { @@ -150,31 +228,63 @@ func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspa return jobObj, nil } +// Now there are two options for data destination 1. HostPath - 2. Image +func prepareDataDestination(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace) (*corev1.Container, *corev1.LocalObjectReference, corev1.Volume, corev1.VolumeMount, error) { + var sidecarContainer *corev1.Container + var volume corev1.Volume + var volumeMount corev1.VolumeMount + var imagePushSecret *corev1.LocalObjectReference + switch { + case workspaceObj.Tuning.Output.Image != "": + image, secret := workspaceObj.Tuning.Output.Image, workspaceObj.Tuning.Output.ImagePushSecret + imagePushSecret = &corev1.LocalObjectReference{Name: secret} + sidecarContainer, volume, volumeMount = handleImageDataDestination(ctx, image, secret) + // TODO: Future PR include + //case workspaceObj.Tuning.Output.Volume != nil: + } + return sidecarContainer, imagePushSecret, volume, volumeMount, nil +} + +func handleImageDataDestination(ctx context.Context, image, imagePushSecret string) (*corev1.Container, corev1.Volume, corev1.VolumeMount) { + sidecarContainer := &corev1.Container{ + Name: "docker-sidecar", + Image: "docker:dind", + SecurityContext: &corev1.SecurityContext{ + Privileged: pointer.BoolPtr(true), + }, + Command: []string{"/bin/sh", "-c"}, + Args: []string{dockerSidecarScriptPushImage(image)}, + } + + volume, volumeMount := utils.ConfigImagePushSecretVolume(imagePushSecret) + return sidecarContainer, volume, volumeMount +} + // Now there are three options for DataSource: 1. URL - 2. HostPath - 3. Image -func prepareDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, kubeClient client.Client) ([]corev1.Container, []corev1.LocalObjectReference, []corev1.Volume, []corev1.VolumeMount, error) { - var initContainers []corev1.Container - var volumes []corev1.Volume - var volumeMounts []corev1.VolumeMount +func prepareDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace) (*corev1.Container, []corev1.LocalObjectReference, corev1.Volume, corev1.VolumeMount, error) { + var initContainer *corev1.Container + var volume corev1.Volume + var volumeMount corev1.VolumeMount var imagePullSecrets []corev1.LocalObjectReference switch { case workspaceObj.Tuning.Input.Image != "": - initContainers, volumes, volumeMounts = handleImageDataSource(ctx, workspaceObj) - _, imagePullSecrets = GetDataSrcImageInfo(ctx, workspaceObj) + var image string + image, imagePullSecrets = GetDataSrcImageInfo(ctx, workspaceObj) + initContainer, volume, volumeMount = handleImageDataSource(ctx, image) case len(workspaceObj.Tuning.Input.URLs) > 0: - initContainers, volumes, volumeMounts = handleURLDataSource(ctx, workspaceObj) + initContainer, volume, volumeMount = handleURLDataSource(ctx, workspaceObj) // TODO: Future PR include // case workspaceObj.Tuning.Input.Volume != nil: } - return initContainers, imagePullSecrets, volumes, volumeMounts, nil + return initContainer, imagePullSecrets, volume, volumeMount, nil } -func handleImageDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace) ([]corev1.Container, []corev1.Volume, []corev1.VolumeMount) { - var initContainers []corev1.Container +func handleImageDataSource(ctx context.Context, image string) (*corev1.Container, corev1.Volume, corev1.VolumeMount) { // Constructing a multistep command that lists, copies, and then lists the destination command := "ls -la /data && cp -r /data/* " + utils.DefaultDataVolumePath + " && ls -la " + utils.DefaultDataVolumePath - initContainers = append(initContainers, corev1.Container{ + initContainer := &corev1.Container{ Name: "data-extractor", - Image: workspaceObj.Tuning.Input.Image, + Image: image, Command: []string{"sh", "-c", command}, VolumeMounts: []corev1.VolumeMount{ { @@ -182,15 +292,14 @@ func handleImageDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Work MountPath: utils.DefaultDataVolumePath, }, }, - }) + } - volumes, volumeMounts := utils.ConfigDataVolume("") - return initContainers, volumes, volumeMounts + volume, volumeMount := utils.ConfigDataVolume(nil) + return initContainer, volume, volumeMount } -func handleURLDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace) ([]corev1.Container, []corev1.Volume, []corev1.VolumeMount) { - var initContainers []corev1.Container - initContainers = append(initContainers, corev1.Container{ +func handleURLDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace) (*corev1.Container, corev1.Volume, corev1.VolumeMount) { + initContainer := &corev1.Container{ Name: "data-downloader", Image: "curlimages/curl", Command: []string{"sh", "-c", ` @@ -215,9 +324,9 @@ func handleURLDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Worksp Value: utils.DefaultDataVolumePath, }, }, - }) - volumes, volumeMounts := utils.ConfigDataVolume("") - return initContainers, volumes, volumeMounts + } + volume, volumeMount := utils.ConfigDataVolume(nil) + return initContainer, volume, volumeMount } func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam) (string, error) { @@ -230,6 +339,9 @@ func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam // and sets the GPU resources required for tuning. // Returns the command and resource configuration. func prepareTuningParameters(ctx context.Context, wObj *kaitov1alpha1.Workspace, modelCommand string, tuningObj *model.PresetParam) ([]string, corev1.ResourceRequirements) { + if tuningObj.TorchRunParams == nil { + tuningObj.TorchRunParams = make(map[string]string) + } // Set # of processes to GPU Count numProcesses := getInstanceGPUCount(wObj.Resource.InstanceType) tuningObj.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) diff --git a/pkg/tuning/preset-tuning_test.go b/pkg/tuning/preset-tuning_test.go index 14ddaf5d2..d4e410a42 100644 --- a/pkg/tuning/preset-tuning_test.go +++ b/pkg/tuning/preset-tuning_test.go @@ -35,6 +35,24 @@ func normalize(s string) string { return strings.Join(strings.Fields(s), " ") } +// Saves state of current env, and returns function to restore to saved state +func saveEnv(key string) func() { + envVal, envExists := os.LookupEnv(key) + return func() { + if envExists { + err := os.Setenv(key, envVal) + if err != nil { + return + } + } else { + err := os.Unsetenv(key) + if err != nil { + return + } + } + } +} + func TestGetInstanceGPUCount(t *testing.T) { kaitov1alpha1.SupportedGPUConfigs = mockSupportedGPUConfigs testcases := map[string]struct { @@ -164,13 +182,16 @@ func TestGetDataSrcImageInfo(t *testing.T) { func TestEnsureTuningConfigMap(t *testing.T) { testcases := map[string]struct { + setupEnv func() callMocks func(c *test.MockClient) workspaceObj *kaitov1alpha1.Workspace expectedError string }{ "Config already exists in workspace namespace": { - callMocks: func(c *test.MockClient) { + setupEnv: func() { os.Setenv(consts.DefaultReleaseNamespaceEnvVar, "release-namespace") + }, + callMocks: func(c *test.MockClient) { c.On("Get", mock.IsType(context.Background()), mock.Anything, mock.IsType(&corev1.ConfigMap{}), mock.Anything).Return(nil) }, workspaceObj: &kaitov1alpha1.Workspace{ @@ -189,11 +210,13 @@ func TestEnsureTuningConfigMap(t *testing.T) { ConfigTemplate: "config-template", }, }, - expectedError: "failed to get ConfigMap from template namespace: \"config-template\" not found", + expectedError: "failed to get release namespace: failed to determine release namespace from file /var/run/secrets/kubernetes.io/serviceaccount/namespace and env var RELEASE_NAMESPACE", }, "Config doesn't exist in template namespace": { - callMocks: func(c *test.MockClient) { + setupEnv: func() { os.Setenv(consts.DefaultReleaseNamespaceEnvVar, "release-namespace") + }, + callMocks: func(c *test.MockClient) { c.On("Get", mock.IsType(context.Background()), mock.Anything, mock.IsType(&corev1.ConfigMap{}), mock.Anything).Return(errors.NewNotFound(schema.GroupResource{}, "config-template")) }, workspaceObj: &kaitov1alpha1.Workspace{ @@ -207,6 +230,12 @@ func TestEnsureTuningConfigMap(t *testing.T) { for name, tc := range testcases { t.Run(name, func(t *testing.T) { + cleanupEnv := saveEnv(consts.DefaultReleaseNamespaceEnvVar) + defer cleanupEnv() + + if tc.setupEnv != nil { + tc.setupEnv() + } mockClient := test.NewClient() tc.callMocks(mockClient) tc.workspaceObj.SetNamespace("workspace-namespace") @@ -247,18 +276,15 @@ func TestHandleImageDataSource(t *testing.T) { for name, tc := range testcases { t.Run(name, func(t *testing.T) { - initContainers, volumes, volumeMounts := handleImageDataSource(context.Background(), tc.workspaceObj) + initContainer, volume, volumeMount := handleImageDataSource(context.Background(), tc.workspaceObj.Tuning.Input.Image) - assert.Len(t, initContainers, 1) - assert.Equal(t, tc.expectedInitContainerName, initContainers[0].Name) - assert.Equal(t, tc.workspaceObj.Tuning.Input.Image, initContainers[0].Image) - assert.Contains(t, initContainers[0].Command[2], "cp -r /data/* /mnt/data") + assert.Equal(t, tc.expectedInitContainerName, initContainer.Name) + assert.Equal(t, tc.workspaceObj.Tuning.Input.Image, initContainer.Image) + assert.Contains(t, initContainer.Command[2], "cp -r /data/* /mnt/data") - assert.Len(t, volumes, 1) - assert.Equal(t, tc.expectedVolumeName, volumes[0].Name) + assert.Equal(t, tc.expectedVolumeName, volume.Name) - assert.Len(t, volumeMounts, 1) - assert.Equal(t, tc.expectedVolumeMountPath, volumeMounts[0].MountPath) + assert.Equal(t, tc.expectedVolumeMountPath, volumeMount.MountPath) }) } } @@ -290,18 +316,15 @@ func TestHandleURLDataSource(t *testing.T) { for name, tc := range testcases { t.Run(name, func(t *testing.T) { - initContainers, volumes, volumeMounts := handleURLDataSource(context.Background(), tc.workspaceObj) + initContainer, volume, volumeMount := handleURLDataSource(context.Background(), tc.workspaceObj) - assert.Len(t, initContainers, 1) - assert.Equal(t, tc.expectedInitContainerName, initContainers[0].Name) - assert.Equal(t, tc.expectedImage, initContainers[0].Image) - assert.Contains(t, normalize(initContainers[0].Command[2]), normalize(tc.expectedCommands)) + assert.Equal(t, tc.expectedInitContainerName, initContainer.Name) + assert.Equal(t, tc.expectedImage, initContainer.Image) + assert.Contains(t, normalize(initContainer.Command[2]), normalize(tc.expectedCommands)) - assert.Len(t, volumes, 1) - assert.Equal(t, tc.expectedVolumeName, volumes[0].Name) + assert.Equal(t, tc.expectedVolumeName, volume.Name) - assert.Len(t, volumeMounts, 1) - assert.Equal(t, tc.expectedVolumeMountPath, volumeMounts[0].MountPath) + assert.Equal(t, tc.expectedVolumeMountPath, volumeMount.MountPath) }) } } @@ -364,31 +387,28 @@ func TestPrepareDataSource_ImageSource(t *testing.T) { } // Expected outputs from mocked functions - expectedVolumes := []corev1.Volume{ - { - Name: "data-volume", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, // Assume we expect an EmptyDir - }, + expectedVolume := corev1.Volume{ + Name: "data-volume", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, // Assume we expect an EmptyDir }, } - expectedVolumeMounts := []corev1.VolumeMount{{Name: "data-volume", MountPath: "/mnt/data"}} + + expectedVolumeMount := corev1.VolumeMount{Name: "data-volume", MountPath: "/mnt/data"} expectedImagePullSecrets := []corev1.LocalObjectReference{} - expectedInitContainers := []corev1.Container{ - { - Name: "data-extractor", - Image: "custom/data-loader-image", - Command: []string{"sh", "-c", "ls -la /data && cp -r /data/* /mnt/data && ls -la /mnt/data"}, - VolumeMounts: expectedVolumeMounts, - }, + expectedInitContainer := &corev1.Container{ + Name: "data-extractor", + Image: "custom/data-loader-image", + Command: []string{"sh", "-c", "ls -la /data && cp -r /data/* /mnt/data && ls -la /mnt/data"}, + VolumeMounts: []corev1.VolumeMount{expectedVolumeMount}, } - initContainers, imagePullSecrets, volumes, volumeMounts, err := prepareDataSource(ctx, workspaceObj, nil) + initContainer, imagePullSecrets, volume, volumeMount, err := prepareDataSource(ctx, workspaceObj) // Assertions assert.NoError(t, err) - assert.Equal(t, expectedInitContainers, initContainers) - assert.Equal(t, expectedVolumes, volumes) - assert.Equal(t, expectedVolumeMounts, volumeMounts) + assert.Equal(t, expectedInitContainer, initContainer) + assert.Equal(t, expectedVolume, volume) + assert.Equal(t, expectedVolumeMount, volumeMount) assert.Equal(t, expectedImagePullSecrets, imagePullSecrets) } diff --git a/pkg/utils/common-preset.go b/pkg/utils/common-preset.go index 87363821b..61295dcde 100644 --- a/pkg/utils/common-preset.go +++ b/pkg/utils/common-preset.go @@ -10,8 +10,49 @@ const ( DefaultVolumeMountPath = "/dev/shm" DefaultConfigMapMountPath = "/mnt/config" DefaultDataVolumePath = "/mnt/data" + DefaultResultsVolumePath = "/mnt/results" ) +func ConfigResultsVolume() (corev1.Volume, corev1.VolumeMount) { + sharedWorkspaceVolume := corev1.Volume{ + Name: "results-volume", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + } + sharedVolumeMount := corev1.VolumeMount{ + Name: "results-volume", + // TODO: Override output path if specified in trainingconfig + MountPath: DefaultResultsVolumePath, + } + return sharedWorkspaceVolume, sharedVolumeMount +} + +func ConfigImagePushSecretVolume(imagePushSecret string) (corev1.Volume, corev1.VolumeMount) { + volume := corev1.Volume{ + Name: "docker-config", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: imagePushSecret, + Items: []corev1.KeyToPath{ + { + Key: ".dockerconfigjson", + Path: "config.json", + }, + }, + }, + }, + } + + volumeMount := corev1.VolumeMount{ + Name: "docker-config", + MountPath: "/root/.docker/config.json", + SubPath: "config.json", // Mount only the config.json file + } + + return volume, volumeMount +} + func ConfigSHMVolume(instanceCount int) (corev1.Volume, corev1.VolumeMount) { volume := corev1.Volume{} volumeMount := corev1.VolumeMount{} @@ -56,14 +97,14 @@ func ConfigCMVolume(cmName string) (corev1.Volume, corev1.VolumeMount) { return volume, volumeMount } -func ConfigDataVolume(hostPath string) ([]corev1.Volume, []corev1.VolumeMount) { - var volumes []corev1.Volume - var volumeMounts []corev1.VolumeMount +func ConfigDataVolume(hostPath *string) (corev1.Volume, corev1.VolumeMount) { + var volume corev1.Volume + var volumeMount corev1.VolumeMount var volumeSource corev1.VolumeSource - if hostPath != "" { + if hostPath != nil { volumeSource = corev1.VolumeSource{ HostPath: &corev1.HostPathVolumeSource{ - Path: hostPath, + Path: *hostPath, }, } } else { @@ -71,14 +112,14 @@ func ConfigDataVolume(hostPath string) ([]corev1.Volume, []corev1.VolumeMount) { EmptyDir: &corev1.EmptyDirVolumeSource{}, } } - volumes = append(volumes, corev1.Volume{ + volume = corev1.Volume{ Name: "data-volume", VolumeSource: volumeSource, - }) + } - volumeMounts = append(volumeMounts, corev1.VolumeMount{ + volumeMount = corev1.VolumeMount{ Name: "data-volume", MountPath: DefaultDataVolumePath, - }) - return volumes, volumeMounts + } + return volume, volumeMount } diff --git a/presets/models/falcon/model.go b/presets/models/falcon/model.go index bc7f882af..018dd5265 100644 --- a/presets/models/falcon/model.go +++ b/presets/models/falcon/model.go @@ -3,6 +3,7 @@ package falcon import ( + "github.com/azure/kaito/pkg/tuning" "time" kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1" @@ -77,7 +78,7 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - //TorchRunParams: tuning.DefaultAccelerateParams, // TODO + TorchRunParams: tuning.DefaultAccelerateParams, //ModelRunPrams: falconRunTuningParams, // TODO ReadinessTimeout: time.Duration(30) * time.Minute, BaseCommand: baseCommandPresetFalcon, @@ -150,7 +151,7 @@ func (*falcon40b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "16Gi", - //TorchRunParams: tuning.DefaultAccelerateParams, // TODO + TorchRunParams: tuning.DefaultAccelerateParams, //ModelRunPrams: falconRunTuningParams, // TODO ReadinessTimeout: time.Duration(30) * time.Minute, BaseCommand: baseCommandPresetFalcon,