Skip to content

Commit

Permalink
feat: sidecar script
Browse files Browse the repository at this point in the history
  • Loading branch information
ishaansehgal99 committed Apr 4, 2024
1 parent 5652d19 commit 53394ee
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
47 changes: 45 additions & 2 deletions pkg/resources/manifests.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,50 @@ func GenerateStatefulSetManifest(ctx context.Context, workspaceObj *kaitov1alpha
}

// dockerSidecarScript returns the POSIX shell script run by the Docker sidecar
// container of a tuning job (it is passed as the argument of "/bin/sh -c").
//
// The script:
//  1. starts a Docker daemon in the background and waits until "docker info"
//     succeeds;
//  2. polls /workspace/tfs every 10 seconds for a fine_tuning_completed.txt
//     marker file;
//  3. once the marker exists, copies adapter_config.json and
//     adapter_model.safetensors from the marker's directory into a temporary
//     build context, builds a scratch-based image from them and pushes it to
//     $ACR_USERNAME.azurecr.io/adapter-falcon-7b:$TAG.
//
// The script reads ACR_USERNAME, ACR_PASSWORD and TAG from the container
// environment. It exits 0 after a successful push and 1 if any packaging,
// login, build or push step fails, so a failed upload is not reported as a
// success.
func dockerSidecarScript() string {
	return `
# Start the Docker daemon in the background (Docker-in-Docker)
dockerd &

# Wait for the Docker daemon to be ready
while ! docker info > /dev/null 2>&1; do
    echo "Waiting for Docker daemon to start..."
    sleep 1
done
echo 'Docker daemon started'

while true; do
    # Keep only the first match so that dirname always receives a single path
    FILE_PATH=$(find /workspace/tfs -name 'fine_tuning_completed.txt' | head -n 1)
    if [ -n "$FILE_PATH" ]; then
        echo "FOUND TRAINING COMPLETED FILE at $FILE_PATH"
        PARENT_DIR=$(dirname "$FILE_PATH")
        echo "Parent directory is $PARENT_DIR"

        TEMP_CONTEXT=$(mktemp -d) || exit 1
        REGISTRY="$ACR_USERNAME.azurecr.io"
        IMAGE="$REGISTRY/adapter-falcon-7b:$TAG"

        # Stage the adapter files, write a minimal Dockerfile, log in to the
        # registry, then build and push. The chain stops at the first failure.
        if cp "$PARENT_DIR/adapter_config.json" "$TEMP_CONTEXT/adapter_config.json" &&
            cp -r "$PARENT_DIR/adapter_model.safetensors" "$TEMP_CONTEXT/adapter_model.safetensors" &&
            printf '%s\n' 'FROM scratch' 'ADD adapter_config.json /' 'ADD adapter_model.safetensors /' > "$TEMP_CONTEXT/Dockerfile" &&
            printf '%s' "$ACR_PASSWORD" | docker login "$REGISTRY" -u "$ACR_USERNAME" --password-stdin &&
            docker build -t "$IMAGE" "$TEMP_CONTEXT" &&
            docker push "$IMAGE"; then
            STATUS=0
        else
            STATUS=1
        fi

        # Cleanup: Remove the temporary directory
        rm -rf "$TEMP_CONTEXT"

        if [ "$STATUS" -ne 0 ]; then
            echo "Upload failed" >&2
            exit 1
        fi

        # Remove the file to prevent repeated builds, or handle as needed
        # rm "$FILE_PATH"
        echo "Upload complete"
        exit 0
    fi
    sleep 10 # Check every 10 seconds
done
`
}

func GenerateTuningJobManifest(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, imagePullSecretRefs []corev1.LocalObjectReference,
Expand Down Expand Up @@ -240,7 +283,7 @@ func GenerateTuningJobManifest(ctx context.Context, workspaceObj *kaitov1alpha1.
},
VolumeMounts: volumeMounts,
Command: []string{"/bin/sh", "-c"},
Args: []string{"docker-sidecar script here..."}, // Placeholder for the actual script
Args: []string{dockerSidecarScript()},
},
},
RestartPolicy: corev1.RestartPolicyNever,
Expand Down
4 changes: 2 additions & 2 deletions pkg/tuning/preset-tuning.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ func getDataDestination(ctx context.Context, workspaceObj *kaitov1alpha1.Workspa
// Returns the command and resource configuration.
func prepareTuningParameters(ctx context.Context, wObj *kaitov1alpha1.Workspace, tuningObj *model.PresetParam) ([]string, corev1.ResourceRequirements) {
// Set # of processes to GPU Count
num_processes := utils.GetInstanceGPUCount(wObj)
tuningObj.TorchRunParams["num_processes"] = fmt.Sprintf("%d", num_processes)
numProcesses := utils.GetInstanceGPUCount(wObj)
tuningObj.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses)
torchCommand := utils.BuildCmdStr(tuningObj.BaseCommand, tuningObj.TorchRunParams)
torchCommand = utils.BuildCmdStr(torchCommand, tuningObj.TorchRunRdzvParams)
modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.ModelRunParams)
Expand Down

0 comments on commit 53394ee

Please sign in to comment.