Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Fine Tune (Part 10) - Updating fine tuning py #371

Merged
merged 44 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
08ef1a4
fine tuning updates
ishaansehgal99 May 3, 2024
009bbe0
remove
ishaansehgal99 May 3, 2024
bd9d2cc
add to docker
ishaansehgal99 May 3, 2024
f599790
combine tuning and inference image
ishaansehgal99 May 3, 2024
42a8ee1
update test
ishaansehgal99 May 3, 2024
4ad8e14
begin adding trainer types
ishaansehgal99 May 3, 2024
52465b0
renaming and file moving
ishaansehgal99 May 3, 2024
76f1cdf
separate requirements
ishaansehgal99 May 3, 2024
1760b55
add parser
ishaansehgal99 May 3, 2024
8df9694
typo
ishaansehgal99 May 4, 2024
2a3e5e5
typo
ishaansehgal99 May 4, 2024
1534183
typo
ishaansehgal99 May 4, 2024
4a3acdc
mime type
ishaansehgal99 May 4, 2024
f885c31
fix
ishaansehgal99 May 4, 2024
c649b89
add python magic
ishaansehgal99 May 6, 2024
e497413
simplify using filetype
ishaansehgal99 May 6, 2024
2fb7798
dataset
ishaansehgal99 May 7, 2024
ef7ab5c
Merge branch 'main' into Ishaan/fine-tuning-py
ishaansehgal99 May 7, 2024
3b66555
Remove all CLI parser logic (unused) and add support for SFT_Trainer …
ishaansehgal99 May 8, 2024
7c7b7b1
Merge branch 'Ishaan/fine-tuning-py' of https://github.com/Azure/kait…
ishaansehgal99 May 8, 2024
519f2a3
Separate out into dataset class
ishaansehgal99 May 8, 2024
5aac066
Add dataset
ishaansehgal99 May 8, 2024
784fe11
datasets support
ishaansehgal99 May 9, 2024
659decb
Add support for datasets
ishaansehgal99 May 9, 2024
c3ed446
header
ishaansehgal99 May 9, 2024
156e62b
feat: format and preprocess
ishaansehgal99 May 10, 2024
49fcb2c
fix some edge cases
ishaansehgal99 May 10, 2024
9fb46aa
chore: Use image enum
ishaansehgal99 May 21, 2024
5ae940c
minor tweaks
ishaansehgal99 May 21, 2024
caa39eb
separate function
ishaansehgal99 May 21, 2024
fa0d4bd
add helpers
ishaansehgal99 May 22, 2024
cde8657
Remove manifests.go from PR
ishaansehgal99 May 22, 2024
7f88cfa
restore
ishaansehgal99 May 22, 2024
68c2f55
Merge branch 'main' of https://github.com/Azure/kaito into Ishaan/fin…
ishaansehgal99 May 22, 2024
dc01977
Merge branch 'main' into Ishaan/fine-tuning-py
ishaansehgal99 May 23, 2024
5eeff54
file rename
ishaansehgal99 May 23, 2024
2f48ef0
Merge branch 'Ishaan/fine-tuning-py' of https://github.com/Azure/kait…
ishaansehgal99 May 23, 2024
c7fe877
Dockerfile
ishaansehgal99 May 23, 2024
0c222c0
Handle custom output dir
ishaansehgal99 May 27, 2024
2c76c16
variable renamed
ishaansehgal99 May 27, 2024
baf5918
update defaults
ishaansehgal99 May 27, 2024
816d4b5
log msg update
ishaansehgal99 May 28, 2024
1d5e8b4
comments and nits
ishaansehgal99 May 28, 2024
1ba3cc4
Merge branch 'main' into Ishaan/fine-tuning-py
ishaansehgal99 May 28, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions api/v1alpha1/workspace_validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -336,12 +336,12 @@ func (i *InferenceSpec) validateCreate() (errs *apis.FieldError) {
errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported inference preset name %s", presetName), "presetName"))
}
// Validate private preset has private image specified
if plugin.KaitoModelRegister.MustGet(string(i.Preset.Name)).GetInferenceParameters().ImageAccessMode == "private" &&
i.Preset.PresetMeta.AccessMode != "private" {
if plugin.KaitoModelRegister.MustGet(string(i.Preset.Name)).GetInferenceParameters().ImageAccessMode == string(ModelImageAccessModePrivate) &&
i.Preset.PresetMeta.AccessMode != ModelImageAccessModePrivate {
errs = errs.Also(apis.ErrGeneric("This preset only supports private AccessMode, AccessMode must be private to continue"))
}
// Additional validations for Preset
if i.Preset.PresetMeta.AccessMode == "private" && i.Preset.PresetOptions.Image == "" {
if i.Preset.PresetMeta.AccessMode == ModelImageAccessModePrivate && i.Preset.PresetOptions.Image == "" {
errs = errs.Also(apis.ErrGeneric("When AccessMode is private, an image must be provided in PresetOptions"))
}
// Note: we don't enforce private access mode to have image secrets, in case anonymous pulling is enabled
Expand Down
8 changes: 4 additions & 4 deletions api/v1alpha1/workspace_validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ type testModelPrivate struct{}

// GetInferenceParameters returns the inference preset parameters for the
// private test model. The image access mode is set to the private enum value
// so validation paths that require a private image are exercised.
func (*testModelPrivate) GetInferenceParameters() *model.PresetParam {
	return &model.PresetParam{
		ImageAccessMode:           string(ModelImageAccessModePrivate),
		GPUCountRequirement:       gpuCountRequirement,
		TotalGPUMemoryRequirement: totalGPUMemoryRequirement,
		PerGPUMemoryRequirement:   perGPUMemoryRequirement,
	}
}
func (*testModelPrivate) GetTuningParameters() *model.PresetParam {
return &model.PresetParam{
ImageAccessMode: "private",
ImageAccessMode: string(ModelImageAccessModePrivate),
GPUCountRequirement: gpuCountRequirement,
TotalGPUMemoryRequirement: totalGPUMemoryRequirement,
PerGPUMemoryRequirement: perGPUMemoryRequirement,
Expand Down Expand Up @@ -461,7 +461,7 @@ func TestInferenceSpecValidateCreate(t *testing.T) {
Preset: &PresetSpec{
PresetMeta: PresetMeta{
Name: ModelName("test-validation"),
AccessMode: "private",
AccessMode: ModelImageAccessModePrivate,
},
PresetOptions: PresetOptions{},
},
Expand All @@ -488,7 +488,7 @@ func TestInferenceSpecValidateCreate(t *testing.T) {
Preset: &PresetSpec{
PresetMeta: PresetMeta{
Name: ModelName("test-validation"),
AccessMode: "public",
AccessMode: ModelImageAccessModePublic,
},
},
},
Expand Down
2 changes: 1 addition & 1 deletion charts/kaito/workspace/templates/lora-params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ data:
bias: "none"

TrainingArguments:
output_dir: "."
output_dir: "/mnt/results"
num_train_epochs: 4
auto_find_batch_size: true
ddp_find_unused_parameters: false
Expand Down
2 changes: 1 addition & 1 deletion charts/kaito/workspace/templates/qlora-params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ data:
bias: "none"

TrainingArguments:
output_dir: "."
output_dir: "/mnt/results"
ishaansehgal99 marked this conversation as resolved.
Show resolved Hide resolved
num_train_epochs: 4
auto_find_batch_size: true
ddp_find_unused_parameters: false
Expand Down
22 changes: 0 additions & 22 deletions docker/presets/inference/tfs/Dockerfile

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,20 @@ RUN echo $VERSION > /workspace/tfs/version.txt
# First, copy just the preset files and install dependencies
# This is done before copying the code to utilize Docker's layer caching and
# avoid reinstalling dependencies unless the requirements file changes.
COPY kaito/presets/tuning/${MODEL_TYPE}/requirements.txt /workspace/tfs/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
# Inference
COPY kaito/presets/inference/${MODEL_TYPE}/requirements.txt /workspace/tfs/inference-requirements.txt
RUN pip install --no-cache-dir -r inference-requirements.txt

COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py

# Fine Tuning
COPY kaito/presets/tuning/${MODEL_TYPE}/requirements.txt /workspace/tfs/tuning-requirements.txt
RUN pip install --no-cache-dir -r tuning-requirements.txt

COPY kaito/presets/tuning/${MODEL_TYPE}/cli.py /workspace/tfs/cli.py
COPY kaito/presets/tuning/${MODEL_TYPE}/fine_tuning_api.py /workspace/tfs/tuning_api.py
COPY kaito/presets/tuning/${MODEL_TYPE}/fine_tuning_api.py /workspace/tfs/fine_tuning_api.py
COPY kaito/presets/tuning/${MODEL_TYPE}/parser.py /workspace/tfs/parser.py
COPY kaito/presets/tuning/${MODEL_TYPE}/dataset.py /workspace/tfs/dataset.py

# Copy the entire model weights to the weights directory
COPY ${WEIGHTS_PATH} /workspace/tfs/weights
2 changes: 1 addition & 1 deletion pkg/inference/preset-inferences.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient cl

func GetInferenceImageInfo(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, presetObj *model.PresetParam) (string, []corev1.LocalObjectReference) {
imagePullSecretRefs := []corev1.LocalObjectReference{}
if presetObj.ImageAccessMode == "private" {
if presetObj.ImageAccessMode == string(kaitov1alpha1.ModelImageAccessModePrivate) {
imageName := workspaceObj.Inference.Preset.PresetOptions.Image
for _, secretName := range workspaceObj.Inference.Preset.PresetOptions.ImagePullSecrets {
imagePullSecretRefs = append(imagePullSecretRefs, corev1.LocalObjectReference{Name: secretName})
Expand Down
23 changes: 19 additions & 4 deletions pkg/tuning/preset-tuning.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,21 @@ func getInstanceGPUCount(sku string) int {
return gpuConfig.GPUCount
}

func GetTuningImageInfo(ctx context.Context, wObj *kaitov1alpha1.Workspace, presetObj *model.PresetParam) string {
registryName := os.Getenv("PRESET_REGISTRY_NAME")
return fmt.Sprintf("%s/%s:%s", registryName, "kaito-tuning-"+string(wObj.Tuning.Preset.Name), presetObj.Tag)
func GetTuningImageInfo(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, presetObj *model.PresetParam) (string, []corev1.LocalObjectReference) {
imagePullSecretRefs := []corev1.LocalObjectReference{}
if presetObj.ImageAccessMode == string(kaitov1alpha1.ModelImageAccessModePrivate) {
imageName := workspaceObj.Tuning.Preset.PresetOptions.Image
for _, secretName := range workspaceObj.Tuning.Preset.PresetOptions.ImagePullSecrets {
imagePullSecretRefs = append(imagePullSecretRefs, corev1.LocalObjectReference{Name: secretName})
}
return imageName, imagePullSecretRefs
} else {
imageName := string(workspaceObj.Tuning.Preset.Name)
imageTag := presetObj.Tag
registryName := os.Getenv("PRESET_REGISTRY_NAME")
imageName = fmt.Sprintf("%s/kaito-%s:%s", registryName, imageName, imageTag)
return imageName, imagePullSecretRefs
}
}

func GetDataSrcImageInfo(ctx context.Context, wObj *kaitov1alpha1.Workspace) (string, []corev1.LocalObjectReference) {
Expand Down Expand Up @@ -216,7 +228,10 @@ func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspa
return nil, err
}
commands, resourceReq := prepareTuningParameters(ctx, workspaceObj, modelCommand, tuningObj)
tuningImage := GetTuningImageInfo(ctx, workspaceObj, tuningObj)
tuningImage, tuningImagePullSecrets := GetTuningImageInfo(ctx, workspaceObj, tuningObj)
if tuningImagePullSecrets != nil {
imagePullSecrets = append(imagePullSecrets, tuningImagePullSecrets...)
}

jobObj := resources.GenerateTuningJobManifest(ctx, workspaceObj, tuningImage, imagePullSecrets, *workspaceObj.Resource.Count, commands,
containerPorts, nil, nil, resourceReq, tolerations, initContainers, sidecarContainers, volumes, volumeMounts)
Expand Down
6 changes: 3 additions & 3 deletions pkg/tuning/preset-tuning_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func TestGetTuningImageInfo(t *testing.T) {
presetObj: &model.PresetParam{
Tag: "latest",
},
expected: "testregistry/kaito-tuning-testpreset:latest",
expected: "testregistry/kaito-testpreset:latest",
},
"Empty Registry Name": {
registryName: "",
Expand All @@ -124,14 +124,14 @@ func TestGetTuningImageInfo(t *testing.T) {
presetObj: &model.PresetParam{
Tag: "latest",
},
expected: "/kaito-tuning-testpreset:latest",
expected: "/kaito-testpreset:latest",
},
}

for name, tc := range testcases {
t.Run(name, func(t *testing.T) {
os.Setenv("PRESET_REGISTRY_NAME", tc.registryName)
result := GetTuningImageInfo(context.Background(), tc.wObj, tc.presetObj)
result, _ := GetTuningImageInfo(context.Background(), tc.wObj, tc.presetObj)
assert.Equal(t, tc.expected, result)
})
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,27 @@
import os
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum, auto
from typing import Any, Dict, List, Optional

import torch
from peft import LoraConfig
from transformers import (BitsAndBytesConfig, DataCollatorForLanguageModeling,
PreTrainedTokenizer, TrainerCallback)
PreTrainedTokenizer)

# Consider Future Support for other trainers
# class TrainerTypes(Enum):
# TRAINER = "Trainer"
# SFT_TRAINER = "SFTTrainer"
# DPO_TRAINER = "DPOTrainer"
# REWARD_TRAINER = "RewardTrainer"
# PPO_TRAINER = "PPOTrainer"
# CPO_TRAINER = "CPOTrainer"
# ORPO_TRAINER = "ORPOTrainer"

# @dataclass
# class TrainerType:
# trainer_type: TrainerTypes = field(default=TrainerTypes.SFT_TRAINER, metadata={"help": "Type of trainer to use for fine-tuning."})

@dataclass
class ExtDataCollator(DataCollatorForLanguageModeling):
Expand All @@ -24,33 +38,36 @@ class ExtLoraConfig(LoraConfig):
target_modules: Optional[List[str]] = field(default=None, metadata={"help": ("List of module names to replace with LoRA.")})
layers_to_transform: Optional[List[int]] = field(default=None, metadata={"help": "Layer indices to apply LoRA"})
layers_pattern: Optional[List[str]] = field(default=None, metadata={"help": "Pattern to match layers for LoRA"})
loftq_config: Dict[str, any] = field(default_factory=dict, metadata={"help": "LoftQ configuration for quantization"})
loftq_config: Dict[str, any] = field(default_factory=dict, metadata={"help": "LoftQ configuration for quantization"})

@dataclass
class DatasetConfig:
    """
    Configuration describing how the fine-tuning dataset is located, parsed,
    shuffled, and split into train/test portions.
    """
    dataset_name: str = field(metadata={"help": "Name of Dataset"})
    dataset_path: Optional[str] = field(default=None, metadata={"help": "Where dataset file is located in the /data folder. This path will be appended to /data. This path should be the absolute path in the image or host."})
    dataset_extension: Optional[str] = field(default=None, metadata={"help": "Optional explicit file extension of the dataset. If not provided, the extension will be derived from the dataset_path."})
    shuffle_dataset: bool = field(default=True, metadata={"help": "Whether to shuffle dataset"})
    shuffle_seed: int = field(default=42, metadata={"help": "Seed for shuffling data"})
    # instruction_column: Optional[str] = field(default=None, metadata={"help": "Optional column for detailed instructions, used in more structured tasks like Alpaca-style setups."})  # Consider including in V2
    context_column: Optional[str] = field(default=None, metadata={"help": "Column for additional context or prompts, used for generating responses based on scenarios."})
    response_column: str = field(default="text", metadata={"help": "Main text column for standalone entries or the response part in prompt-response setups."})
    messages_column: Optional[str] = field(default=None, metadata={"help": "Column containing structured conversational data in JSON format with roles and content, used for chatbot training."})
    train_test_split: float = field(default=0.8, metadata={"help": "Split between test and training data (e.g. 0.8 means 80/20% train/test split)"})

@dataclass
class TokenizerParams:
"""
Tokenizer params
Tokenizer params
"""
add_special_tokens: bool = field(default=True, metadata={"help": ""})
padding: bool = field(default=False, metadata={"help": ""})
truncation: bool = field(default=None, metadata={"help": ""})
max_length: Optional[int] = field(default=None, metadata={"help": ""})
stride: int = field(default=0, metadata={"help": ""})
is_split_into_words: bool = field(default=False, metadata={"help": ""})
tok_pad_to_multiple_of: Optional[int] = field(default=None, metadata={"help": ""})
tok_return_tensors: Optional[str] = field(default=None, metadata={"help": ""})
pad_to_multiple_of: Optional[int] = field(default=None, metadata={"help": ""})
return_tensors: Optional[str] = field(default=None, metadata={"help": ""})
return_token_type_ids: Optional[bool] = field(default=None, metadata={"help": ""})
return_attention_mask: Optional[bool] = field(default=None, metadata={"help": ""})
return_overflowing_tokens: bool = field(default=False, metadata={"help": ""})
Expand All @@ -72,21 +89,23 @@ class ModelConfig:
resume_download: bool = field(default=False, metadata={"help": "Resume an interrupted download"})
proxies: Optional[str] = field(default=None, metadata={"help": "Proxy configuration for downloading the model"})
output_loading_info: bool = field(default=False, metadata={"help": "Output additional loading information"})
allow_remote_files: bool = field(default=False, metadata={"help": "Allow using remote files, default is local only"})
m_revision: str = field(default="main", metadata={"help": "Specific model version to use"})
local_files_only: bool = field(default=False, metadata={"help": "Allow using remote files, default is local only"})
revision: str = field(default="main", metadata={"help": "Specific model version to use"})
trust_remote_code: bool = field(default=False, metadata={"help": "Enable trusting remote code when loading the model"})
m_load_in_4bit: bool = field(default=False, metadata={"help": "Load model in 4-bit mode"})
m_load_in_8bit: bool = field(default=False, metadata={"help": "Load model in 8-bit mode"})
load_in_4bit: bool = field(default=False, metadata={"help": "Load model in 4-bit mode"})
load_in_8bit: bool = field(default=False, metadata={"help": "Load model in 8-bit mode"})
torch_dtype: Optional[str] = field(default=None, metadata={"help": "The torch dtype for the pre-trained model"})
device_map: str = field(default="auto", metadata={"help": "The device map for the pre-trained model"})

def __post_init__(self):
"""
Post-initialization to validate some ModelConfig values
"""
if self.torch_dtype and not hasattr(torch, self.torch_dtype):
raise ValueError(f"Invalid torch dtype: {self.torch_dtype}")
self.torch_dtype = getattr(torch, self.torch_dtype) if self.torch_dtype else None
if self.torch_dtype:
if isinstance(self.torch_dtype, str) and hasattr(torch, self.torch_dtype):
self.torch_dtype = getattr(torch, self.torch_dtype)
elif not isinstance(self.torch_dtype, torch.dtype):
raise ValueError(f"Invalid torch dtype: {self.torch_dtype}")

@dataclass
class QuantizationConfig(BitsAndBytesConfig):
Expand All @@ -104,14 +123,6 @@ class QuantizationConfig(BitsAndBytesConfig):
bnb_4bit_quant_type: str = field(default="fp4", metadata={"help": "Quantization type for 4-bit"})
bnb_4bit_use_double_quant: bool = field(default=False, metadata={"help": "Use double quantization for 4-bit"})

@dataclass
class TrainingConfig:
"""
Configuration for fine_tuning process
"""
save_output_path: str = field(default=".", metadata={"help": "Path where fine_tuning output is saved"})
# Other fine_tuning-related configurations can go here

# class CheckpointCallback(TrainerCallback):
# def on_train_end(self, args, state, control, **kwargs):
# model_path = args.output_dir
Expand Down
Loading
Loading