diff --git a/api/v1alpha1/workspace_labels.go b/api/v1alpha1/workspace_labels.go index bcd8be127..d9709d057 100644 --- a/api/v1alpha1/workspace_labels.go +++ b/api/v1alpha1/workspace_labels.go @@ -5,13 +5,13 @@ const ( // Non-prefixed labels/annotations are reserved for end-use. // KAITOPrefix Kubernetes Data Mining prefix. - KAITOPrefix = "kubernetes-kaito.sh/" + KAITOPrefix = "kaito.sh/" // AnnotationServiceType determines whether kaito creates ClusterIP or LoadBalancer type service. AnnotationServiceType = KAITOPrefix + "service-type" // LabelWorkspaceName is the label for workspace name. - LabelWorkspaceName = KAITOPrefix + "workspace-name" + LabelWorkspaceName = KAITOPrefix + "workspace" ServiceTypeClusterIP = "cluster-ip" ServiceTypeLoadBalancer = "load-balancer" diff --git a/pkg/controllers/workspace_controller.go b/pkg/controllers/workspace_controller.go index d1e5bbaa8..6038e62af 100644 --- a/pkg/controllers/workspace_controller.go +++ b/pkg/controllers/workspace_controller.go @@ -16,6 +16,7 @@ import ( "github.com/samber/lo" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -277,17 +278,31 @@ func (c *WorkspaceReconciler) validateNodeInstanceType(ctx context.Context, wObj // createAndValidateNode creates a new machine and validates status. 
func (c *WorkspaceReconciler) createAndValidateNode(ctx context.Context, wObj *kaitov1alpha1.Workspace) (*corev1.Node, error) { klog.InfoS("createAndValidateNode", "workspace", klog.KObj(wObj)) + var machineOSDiskSize string + if wObj.Inference.Preset.Name != "" { + machineOSDiskSize = inference.Llama2PresetInferences[wObj.Inference.Preset.Name].DiskStorageRequirement + } + if machineOSDiskSize == "" { + machineOSDiskSize = "0" // The default OS size is used + } - newMachine := machine.GenerateMachineManifest(ctx, inference.Llama2PresetInferences[wObj.Inference.Preset.Name].DiskStorageRequirement, wObj) +Retry_withdifferentname: + newMachine := machine.GenerateMachineManifest(ctx, machineOSDiskSize, wObj) if err := machine.CreateMachine(ctx, newMachine, c.Client); err != nil { - klog.ErrorS(err, "failed to create machine", "machine", newMachine.Name) - if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeMachineStatus, metav1.ConditionFalse, - "machineFailedCreation", err.Error()); err != nil { - klog.ErrorS(err, "failed to update workspace status", "workspace", wObj) + if apierrors.IsAlreadyExists(err) { + klog.InfoS("There exists a machine with the same name, retry with a different name", "machine", newMachine.Name) + goto Retry_withdifferentname + } else { + + klog.ErrorS(err, "failed to create machine", "machine", newMachine.Name) + if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeMachineStatus, metav1.ConditionFalse, + "machineFailedCreation", err.Error()); err != nil { + klog.ErrorS(err, "failed to update workspace status", "workspace", wObj) + return nil, err + } return nil, err } - return nil, err } klog.InfoS("a new machine has been created", "machine", newMachine.Name) @@ -318,7 +333,7 @@ func (c *WorkspaceReconciler) ensureNodePlugins(ctx context.Context, wObj *kaito return ctx.Err() default: if nodeObj == nil { - return errors.NewNotFound(core.Resource("nodes"), 
nodeObj.Name) + return apierrors.NewNotFound(core.Resource("nodes"), nodeObj.Name) } //Nvidia Plugin @@ -326,7 +341,7 @@ func (c *WorkspaceReconciler) ensureNodePlugins(ctx context.Context, wObj *kaito if !foundNvidiaPlugin { err := k8sresources.UpdateNodeWithLabel(ctx, nodeObj.Name, k8sresources.LabelKeyNvidia, k8sresources.LabelValueNvidia, c.Client) if err != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { klog.ErrorS(err, "nvidia plugin cannot be installed, node not found", "node", nodeObj.Name) if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeMachineStatus, metav1.ConditionFalse, "checkMachineStatusFailed", err.Error()); err != nil { @@ -363,7 +378,7 @@ func (c *WorkspaceReconciler) applyAnnotations(ctx context.Context, wObj *kaitov existingSVC := &corev1.Service{} err := k8sresources.GetResource(ctx, wObj.Name, wObj.Namespace, c.Client, existingSVC) if err != nil { - if !errors.IsNotFound(err) { + if !apierrors.IsNotFound(err) { return err } } else { @@ -388,7 +403,7 @@ func (c *WorkspaceReconciler) applyInference(ctx context.Context, wObj *kaitov1a existingObj := &appsv1.StatefulSet{} err := k8sresources.GetResource(ctx, wObj.Name, wObj.Namespace, c.Client, existingObj) if err != nil { - if !errors.IsNotFound(err) { + if !apierrors.IsNotFound(err) { if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeInferenceStatus, metav1.ConditionFalse, "WorkspaceInferenceStatusFailed", err.Error()); err != nil { klog.ErrorS(err, "failed to update workspace status", "workspace", wObj) diff --git a/pkg/machine/machine.go b/pkg/machine/machine.go index 6122d6c90..c409138d0 100644 --- a/pkg/machine/machine.go +++ b/pkg/machine/machine.go @@ -2,8 +2,9 @@ package machine import ( "context" + "crypto/sha256" + "encoding/hex" "fmt" - "math/rand" "time" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" @@ -36,7 +37,8 @@ var ( func GenerateMachineManifest(ctx context.Context, 
storageRequirement string, workspaceObj *kaitov1alpha1.Workspace) *v1alpha5.Machine { klog.InfoS("GenerateMachineManifest", "workspace", klog.KObj(workspaceObj)) - machineName := fmt.Sprint("machine", rand.Intn(100_000)) + digest := sha256.Sum256([]byte(workspaceObj.Namespace + workspaceObj.Name + time.Now().Format("2006-01-02 15:04:05.000000000"))) // We make sure the machine name is not fixed to a workspace + machineName := "ws" + hex.EncodeToString(digest[0:])[0:9] machineLabels := map[string]string{ LabelProvisionerName: ProvisionerName, kaitov1alpha1.LabelWorkspaceName: workspaceObj.Name, @@ -44,6 +46,7 @@ func GenerateMachineManifest(ctx context.Context, storageRequirement string, wor if workspaceObj.Resource.LabelSelector != nil && len(workspaceObj.Resource.LabelSelector.MatchLabels) != 0 { machineLabels = lo.Assign(machineLabels, workspaceObj.Resource.LabelSelector.MatchLabels) + } return &v1alpha5.Machine{