Skip to content

Commit

Permalink
feat: Make sure machine name is unique (#84)
Browse files Browse the repository at this point in the history
Co-authored-by: guofei <guofei@microsoft.com>
  • Loading branch information
Fei-Guo and Fei-Guo authored Oct 18, 2023
1 parent b8ab612 commit ce26b36
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 14 deletions.
4 changes: 2 additions & 2 deletions api/v1alpha1/workspace_labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ const (
// Non-prefixed labels/annotations are reserved for end-use.

// KAITOPrefix is the Kubernetes Data Mining prefix.
KAITOPrefix = "kubernetes-kaito.sh/"
KAITOPrefix = "kaito.sh/"

// AnnotationServiceType determines whether kaito creates ClusterIP or LoadBalancer type service.
AnnotationServiceType = KAITOPrefix + "service-type"

// LabelWorkspaceName is the label for workspace name.
LabelWorkspaceName = KAITOPrefix + "workspace-name"
LabelWorkspaceName = KAITOPrefix + "workspace"

ServiceTypeClusterIP = "cluster-ip"
ServiceTypeLoadBalancer = "load-balancer"
Expand Down
35 changes: 25 additions & 10 deletions pkg/controllers/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
Expand Down Expand Up @@ -277,17 +278,31 @@ func (c *WorkspaceReconciler) validateNodeInstanceType(ctx context.Context, wObj
// createAndValidateNode creates a new machine and validates status.
func (c *WorkspaceReconciler) createAndValidateNode(ctx context.Context, wObj *kaitov1alpha1.Workspace) (*corev1.Node, error) {
klog.InfoS("createAndValidateNode", "workspace", klog.KObj(wObj))
var machineOSDiskSize string
if wObj.Inference.Preset.Name != "" {
machineOSDiskSize = inference.Llama2PresetInferences[wObj.Inference.Preset.Name].DiskStorageRequirement
}
if machineOSDiskSize == "" {
machineOSDiskSize = "0" // The default OS size is used
}

newMachine := machine.GenerateMachineManifest(ctx, inference.Llama2PresetInferences[wObj.Inference.Preset.Name].DiskStorageRequirement, wObj)
Retry_withdifferentname:
newMachine := machine.GenerateMachineManifest(ctx, machineOSDiskSize, wObj)

if err := machine.CreateMachine(ctx, newMachine, c.Client); err != nil {
klog.ErrorS(err, "failed to create machine", "machine", newMachine.Name)
if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeMachineStatus, metav1.ConditionFalse,
"machineFailedCreation", err.Error()); err != nil {
klog.ErrorS(err, "failed to update workspace status", "workspace", wObj)
if apierrors.IsAlreadyExists(err) {
klog.InfoS("There exists a machine with the same name, retry with a different name", "machine", newMachine.Name)
goto Retry_withdifferentname
} else {

klog.ErrorS(err, "failed to create machine", "machine", newMachine.Name)
if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeMachineStatus, metav1.ConditionFalse,
"machineFailedCreation", err.Error()); err != nil {
klog.ErrorS(err, "failed to update workspace status", "workspace", wObj)
return nil, err
}
return nil, err
}
return nil, err
}
klog.InfoS("a new machine has been created", "machine", newMachine.Name)

Expand Down Expand Up @@ -318,15 +333,15 @@ func (c *WorkspaceReconciler) ensureNodePlugins(ctx context.Context, wObj *kaito
return ctx.Err()
default:
if nodeObj == nil {
return errors.NewNotFound(core.Resource("nodes"), nodeObj.Name)
return apierrors.NewNotFound(core.Resource("nodes"), nodeObj.Name)
}

//Nvidia Plugin
foundNvidiaPlugin = k8sresources.CheckNvidiaPlugin(ctx, nodeObj)
if !foundNvidiaPlugin {
err := k8sresources.UpdateNodeWithLabel(ctx, nodeObj.Name, k8sresources.LabelKeyNvidia, k8sresources.LabelValueNvidia, c.Client)
if err != nil {
if errors.IsNotFound(err) {
if apierrors.IsNotFound(err) {
klog.ErrorS(err, "nvidia plugin cannot be installed, node not found", "node", nodeObj.Name)
if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeMachineStatus, metav1.ConditionFalse,
"checkMachineStatusFailed", err.Error()); err != nil {
Expand Down Expand Up @@ -363,7 +378,7 @@ func (c *WorkspaceReconciler) applyAnnotations(ctx context.Context, wObj *kaitov
existingSVC := &corev1.Service{}
err := k8sresources.GetResource(ctx, wObj.Name, wObj.Namespace, c.Client, existingSVC)
if err != nil {
if !errors.IsNotFound(err) {
if !apierrors.IsNotFound(err) {
return err
}
} else {
Expand All @@ -388,7 +403,7 @@ func (c *WorkspaceReconciler) applyInference(ctx context.Context, wObj *kaitov1a
existingObj := &appsv1.StatefulSet{}
err := k8sresources.GetResource(ctx, wObj.Name, wObj.Namespace, c.Client, existingObj)
if err != nil {
if !errors.IsNotFound(err) {
if !apierrors.IsNotFound(err) {
if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeInferenceStatus, metav1.ConditionFalse,
"WorkspaceInferenceStatusFailed", err.Error()); err != nil {
klog.ErrorS(err, "failed to update workspace status", "workspace", wObj)
Expand Down
7 changes: 5 additions & 2 deletions pkg/machine/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ package machine

import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"math/rand"
"time"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
Expand Down Expand Up @@ -36,14 +37,16 @@ var (
func GenerateMachineManifest(ctx context.Context, storageRequirement string, workspaceObj *kaitov1alpha1.Workspace) *v1alpha5.Machine {
klog.InfoS("GenerateMachineManifest", "workspace", klog.KObj(workspaceObj))

machineName := fmt.Sprint("machine", rand.Intn(100_000))
digest := sha256.Sum256([]byte(workspaceObj.Namespace + workspaceObj.Name + time.Now().Format("2006-01-02 15:04:05.000000000"))) // Mixing in the current time ensures the machine name is not fixed to a workspace.
machineName := "ws" + hex.EncodeToString(digest[0:])[0:9]
machineLabels := map[string]string{
LabelProvisionerName: ProvisionerName,
kaitov1alpha1.LabelWorkspaceName: workspaceObj.Name,
}
if workspaceObj.Resource.LabelSelector != nil &&
len(workspaceObj.Resource.LabelSelector.MatchLabels) != 0 {
machineLabels = lo.Assign(machineLabels, workspaceObj.Resource.LabelSelector.MatchLabels)

}

return &v1alpha5.Machine{
Expand Down

0 comments on commit ce26b36

Please sign in to comment.