From f5a0862f9823c74ea7662686c5c11b452f68ce0b Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Mon, 5 Feb 2024 21:23:49 -0800
Subject: [PATCH 01/24] feat: clean up inference, add validation checks
---
 .../text-generation/inference-api.py     | 135 ++++++++++--------
 .../text-generation/requirements.txt     |   1 +
 2 files changed, 77 insertions(+), 59 deletions(-)
diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py
index 7aa8c8ea4..de204401c 100644
--- a/presets/inference/text-generation/inference-api.py
+++ b/presets/inference/text-generation/inference-api.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
-import argparse
 import os
+from dataclasses import asdict, dataclass, field
 from typing import Any, Dict, List, Optional
 import GPUtil
@@ -9,62 +9,54 @@
 import transformers
 import uvicorn
 from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel, Field
-from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
-
+from pydantic import BaseModel, Extra, Field
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          GenerationConfig, HfArgumentParser)
+
+
+@dataclass
+class ModelConfig:
+    """
+    HuggingFace Model Configuration Parameters
+    """
+    pipeline: str = field(metadata={"help": "The model pipeline for the pre-trained model"})
+    pretrained_model_name_or_path: Optional[str] = field(default="/workspace/tfs/weights", metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"})
+    state_dict: Optional[Dict[str, Any]] = field(default=None, metadata={"help": "State dictionary for the model"})
+    cache_dir: Optional[str] = field(default=None, metadata={"help": "Cache directory for the model"})
+    from_tf: bool = field(default=False, metadata={"help": "Load model from a TensorFlow checkpoint"})
+    force_download: bool = field(default=False, metadata={"help": "Force the download of the model"})
+    resume_download: bool = field(default=False, metadata={"help": "Resume an interrupted download"})
+    proxies: Optional[str] = field(default=None, metadata={"help": "Proxy configuration for downloading the model"})
+    output_loading_info: bool = field(default=False, metadata={"help": "Output additional loading information"})
+    use_remote_files: bool = field(default=False, metadata={"help": "Allow using remote files, default is local only"})
+    revision: str = field(default="main", metadata={"help": "Specific model version to use"})
+    trust_remote_code: bool = field(default=False, metadata={"help": "Enable trusting remote code when loading the model"})
+    load_in_4bit: bool = field(default=False, metadata={"help": "Load model in 4-bit mode"})
+    load_in_8bit: bool = field(default=False, metadata={"help": "Load model in 8-bit mode"})
+    torch_dtype: Optional[str] = field(default=None, metadata={"help": "The torch dtype for the pre-trained model"})
+    device_map: str = field(default="auto", metadata={"help": "The device map for the pre-trained model"})
+
+    def __post_init__(self):
+        if self.torch_dtype and not hasattr(torch, self.torch_dtype):
+            raise ValueError(f"Invalid torch dtype: {self.torch_dtype}")
+        self.torch_dtype = getattr(torch, self.torch_dtype) if self.torch_dtype else None
+
+        supported_pipelines = {"conversational", "text-generation"}
+        if self.pipeline not in supported_pipelines:
+            raise ValueError(f"Unsupported pipeline: {self.pipeline}")
+
+parser = HfArgumentParser(ModelConfig)
+args, unknown_args = parser.parse_args_into_dataclasses(
+    return_remaining_strings=True
+)
-def dtype_type(string):
-    if hasattr(torch, string):
-        return getattr(torch, string)
-    else:
-        raise ValueError(f"Invalid torch dtype: {string}")
-
-parser = argparse.ArgumentParser(description='Model Configuration')
-parser.add_argument('--pipeline', required=True, type=str, help='The model pipeline for the pre-trained model')
-parser.add_argument('--load_in_8bit', default=False, action='store_true', help='Load model in 8-bit mode')
-parser.add_argument('--trust_remote_code', default=False, action='store_true', help='Enable trusting remote code when loading the model')
-parser.add_argument('--torch_dtype', default=None, type=dtype_type, help='The torch dtype for the pre-trained model')
-parser.add_argument('--device_map', default="auto", type=str, help='The device map for the pre-trained model')
-parser.add_argument('--cache_dir', type=str, default=None, help='Cache directory for the model')
-parser.add_argument('--from_tf', action='store_true', default=False, help='Load model from a TensorFlow checkpoint')
-parser.add_argument('--force_download', action='store_true', default=False, help='Force the download of the model')
-parser.add_argument('--resume_download', action='store_true', default=False, help='Resume an interrupted download')
-parser.add_argument('--proxies', type=str, default=None, help='Proxy configuration for downloading the model')
-parser.add_argument('--revision', type=str, default="main", help='Specific model version to use')
-# parser.add_argument('--local_files_only', action='store_true', default=False, help='Only use local files for model loading')
-parser.add_argument('--output_loading_info', action='store_true', default=False, help='Output additional loading information')
-
-args = parser.parse_args()
+model_args = asdict(args)
+model_args["local_files_only"] = not model_args.pop('use_remote_files')
+model_pipeline = model_args.pop('pipeline')
 app = FastAPI()
-
-supported_pipelines = {"conversational", "text-generation"}
-if args.pipeline not in supported_pipelines:
-    raise HTTPException(status_code=400, detail="Invalid pipeline specified")
-
-model_kwargs = {
-    "cache_dir": args.cache_dir,
-    "from_tf": args.from_tf,
-    "force_download": args.force_download,
-    "resume_download": args.resume_download,
-    "proxies": args.proxies,
-    "revision": args.revision,
-    "output_loading_info": args.output_loading_info,
-    "trust_remote_code": args.trust_remote_code,
-    "device_map": args.device_map,
-    "local_files_only": True,
-}
-
-if args.load_in_8bit:
-    model_kwargs["load_in_8bit"] = args.load_in_8bit
-if args.torch_dtype:
-    model_kwargs["torch_dtype"] = args.torch_dtype
-
-tokenizer = AutoTokenizer.from_pretrained("/workspace/tfs/weights", **model_kwargs)
-model = AutoModelForCausalLM.from_pretrained(
-    "/workspace/tfs/weights",
-    **model_kwargs
-)
+tokenizer = AutoTokenizer.from_pretrained(**model_args)
+model = AutoModelForCausalLM.from_pretrained(**model_args)
 pipeline_kwargs = {
     "trust_remote_code": args.trust_remote_code,
@@ -75,7 +67,7 @@ def dtype_type(string):
     pipeline_kwargs["torch_dtype"] = args.torch_dtype
 pipeline = transformers.pipeline(
-    args.pipeline,
+    model_pipeline,
     model=model,
     tokenizer=tokenizer,
     **pipeline_kwargs
@@ -101,17 +93,42 @@ def health_check():
         raise HTTPException(status_code=500, detail="Pipeline not initialized")
     return {"status": "Healthy"}
+class GenerateKwargs(BaseModel):
+    max_length: int = 200
+    min_length: int = 0
+    do_sample: bool = True
+    early_stopping: bool = False
+ num_beams: int = 1 + num_beam_groups: int = 1 + diversity_penalty: float = 0.0 + temperature: float = 1.0 + top_k: int = 10 + top_p: float = 1 + typical_p: float = 1 + repetition_penalty: float = 1 + length_penalty: float = 1 + no_repeat_ngram_size: int = 0 + encoder_no_repeat_ngram_size: int = 0 + bad_words_ids: Optional[List[int]] = None + num_return_sequences: int = 1 + output_scores: bool = False + return_dict_in_generate: bool = False + pad_token_id: Optional[int] = tokenizer.pad_token_id + eos_token_id: Optional[int] = tokenizer.eos_token_id + forced_bos_token_id: Optional[int] = None + forced_eos_token_id: Optional[int] = None + remove_invalid_values: Optional[bool] = None + class Config: + extra = Extra.allow # Allows for additional fields not explicitly defined + class UnifiedRequestModel(BaseModel): # Fields for text generation prompt: Optional[str] = Field(None, description="Prompt for text generation") - # Mutually Exclusive with return_full_text - # return_tensors: Optional[bool] = Field(False, description="Return tensors of predictions") - # return_text: Optional[bool] = Field(True, description="Return decoded texts in the outputs") return_full_text: Optional[bool] = Field(True, description="Return full text if True, else only added text") clean_up_tokenization_spaces: Optional[bool] = Field(False, description="Clean up extra spaces in text output") prefix: Optional[str] = Field(None, description="Prefix added to prompt") handle_long_generation: Optional[str] = Field(None, description="Strategy to handle long generation") - generate_kwargs: Optional[Dict[str, Any]] = Field(None, description="Additional kwargs for generate method") + generate_kwargs: Optional[GenerateKwargs] = Field(None, description="Additional kwargs for generate method") # Field for conversational model messages: Optional[List[dict]] = Field(None, description="Messages for conversational model") diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 2e8a5e33e..424159dba 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -6,5 +6,6 @@ fastapi==0.103.2 pydantic==1.10.9 uvicorn[standard]==0.23.2 bitsandbytes==0.41.1 +scipy==1.10.1 deepspeed==0.11.1 gputil==1.4.0 \ No newline at end of file From 8e7f6c3233490c18b17b1ba1375c9e9ee46bf848 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 6 Feb 2024 10:56:22 -0800 Subject: [PATCH 02/24] feat: unknown arg parsing added --- .../text-generation/inference-api.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index de204401c..145a8db12 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -17,7 +17,7 @@ @dataclass class ModelConfig: """ - HuggingFace Model Configuration Parameters + Transformers Model Configuration Parameters """ pipeline: str = field(metadata={"help": "The model pipeline for the pre-trained model"}) pretrained_model_name_or_path: Optional[str] = field(default="/workspace/tfs/weights", metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"}) @@ -36,7 +36,31 @@ class ModelConfig: torch_dtype: Optional[str] = field(default=None, metadata={"help": "The torch dtype for the pre-trained model"}) device_map: str = field(default="auto", metadata={"help": "The device 
map for the pre-trained model"}) + # Method to process unknown arguments + def process_unknown_args(self, unknown_args: List[str]): + """ + Process unknown cmd line args and update the model configuration accordingly. + """ + unknown_args_dict = {} + i = 0 + while i < len(unknown_args): + key = unknown_args[i].lstrip('-') # Remove leading dashes + if i + 1 < len(unknown_args) and not unknown_args[i + 1].startswith('--'): + value = unknown_args[i + 1] + i += 2 # Move past the current key-value pair + else: + value = True # Assign a True value for standalone flags + i += 1 # Move to the next item + + unknown_args_dict[key] = value + + # Update the ModelConfig instance with the unknown args + self.__dict__.update(unknown_args_dict) + def __post_init__(self): + """ + Post-initialization to validate some ModelConfig values + """ if self.torch_dtype and not hasattr(torch, self.torch_dtype): raise ValueError(f"Invalid torch dtype: {self.torch_dtype}") self.torch_dtype = getattr(torch, self.torch_dtype) if self.torch_dtype else None @@ -50,6 +74,8 @@ def __post_init__(self): return_remaining_strings=True ) +args.process_unknown_args(unknown_args) + model_args = asdict(args) model_args["local_files_only"] = not model_args.pop('use_remote_files') model_pipeline = model_args.pop('pipeline') From 8cc3f2ea42179b73620638dc30f4df241032475d Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 6 Feb 2024 11:17:52 -0800 Subject: [PATCH 03/24] feat: rename func --- .../text-generation/inference-api.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index 145a8db12..c41cffa8e 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -36,26 +36,26 @@ class ModelConfig: torch_dtype: Optional[str] = field(default=None, metadata={"help": "The torch dtype for the pre-trained model"}) device_map: str = field(default="auto", metadata={"help": "The device map for the pre-trained model"}) - # Method to process unknown arguments - def process_unknown_args(self, unknown_args: List[str]): + # Method to process additional arguments + def process_additional_args(self, addt_args: List[str]): """ - Process unknown cmd line args and update the model configuration accordingly. + Process additional cmd line args and update the model configuration accordingly. 
""" - unknown_args_dict = {} + addt_args_dict = {} i = 0 - while i < len(unknown_args): - key = unknown_args[i].lstrip('-') # Remove leading dashes - if i + 1 < len(unknown_args) and not unknown_args[i + 1].startswith('--'): - value = unknown_args[i + 1] + while i < len(addt_args): + key = addt_args[i].lstrip('-') # Remove leading dashes + if i + 1 < len(addt_args) and not addt_args[i + 1].startswith('--'): + value = addt_args[i + 1] i += 2 # Move past the current key-value pair else: value = True # Assign a True value for standalone flags i += 1 # Move to the next item - unknown_args_dict[key] = value + addt_args_dict[key] = value - # Update the ModelConfig instance with the unknown args - self.__dict__.update(unknown_args_dict) + # Update the ModelConfig instance with the additional args + self.__dict__.update(addt_args_dict) def __post_init__(self): """ @@ -70,11 +70,11 @@ def __post_init__(self): raise ValueError(f"Unsupported pipeline: {self.pipeline}") parser = HfArgumentParser(ModelConfig) -args, unknown_args = parser.parse_args_into_dataclasses( +args, additional_args = parser.parse_args_into_dataclasses( return_remaining_strings=True ) -args.process_unknown_args(unknown_args) +args.process_additional_args(additional_args) model_args = asdict(args) model_args["local_files_only"] = not model_args.pop('use_remote_files') From b6351097ab024311388f85e8b3b5c59b44f1776a Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 6 Feb 2024 11:24:56 -0800 Subject: [PATCH 04/24] fix: library version --- presets/inference/text-generation/inference-api.py | 3 ++- presets/inference/text-generation/requirements.txt | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index c41cffa8e..3fa9ddac9 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -101,7 +101,8 @@ def __post_init__(self): try: # Attempt to load the generation configuration - default_generate_config = GenerationConfig.from_pretrained("/workspace/tfs/weights", local_files_only=True).to_dict() + default_generate_config = GenerationConfig.from_pretrained( + args.pretrained_model_name_or_path, local_files_only=True).to_dict() except Exception as e: default_generate_config = {} diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 424159dba..93cf064d9 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -5,7 +5,6 @@ accelerate==0.23.0 fastapi==0.103.2 pydantic==1.10.9 uvicorn[standard]==0.23.2 -bitsandbytes==0.41.1 -scipy==1.10.1 +bitsandbytes==0.42.0 deepspeed==0.11.1 gputil==1.4.0 \ No newline at end of file From 6208731b5276a3100e1862fc0724b004b3336e76 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 6 Feb 2024 11:30:36 -0800 Subject: [PATCH 05/24] fix: formatting --- presets/inference/text-generation/inference-api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index 3fa9ddac9..d84379cab 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -102,7 +102,9 @@ def __post_init__(self): try: # Attempt to load the generation configuration default_generate_config = GenerationConfig.from_pretrained( - 
args.pretrained_model_name_or_path, local_files_only=True).to_dict() + args.pretrained_model_name_or_path, + local_files_only=args.local_files_only + ).to_dict() except Exception as e: default_generate_config = {} From 8d1757749008d7454c7072f98a9f089978e8140e Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 12 Feb 2024 15:51:06 -0800 Subject: [PATCH 06/24] fix: add some additional params --- presets/inference/text-generation/inference-api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index d84379cab..764d135aa 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -28,7 +28,7 @@ class ModelConfig: resume_download: bool = field(default=False, metadata={"help": "Resume an interrupted download"}) proxies: Optional[str] = field(default=None, metadata={"help": "Proxy configuration for downloading the model"}) output_loading_info: bool = field(default=False, metadata={"help": "Output additional loading information"}) - use_remote_files: bool = field(default=False, metadata={"help": "Allow using remote files, default is local only"}) + allow_remote_files: bool = field(default=False, metadata={"help": "Allow using remote files, default is local only"}) revision: str = field(default="main", metadata={"help": "Specific model version to use"}) trust_remote_code: bool = field(default=False, metadata={"help": "Enable trusting remote code when loading the model"}) load_in_4bit: bool = field(default=False, metadata={"help": "Load model in 4-bit mode"}) @@ -77,7 +77,7 @@ def __post_init__(self): args.process_additional_args(additional_args) model_args = asdict(args) -model_args["local_files_only"] = not model_args.pop('use_remote_files') +model_args["local_files_only"] = not model_args.pop('allow_remote_files') model_pipeline = model_args.pop('pipeline') app = FastAPI() From 02189bacdcdf4f1c0c640b80ea79106744486095 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 12 Feb 2024 20:31:58 -0800 Subject: [PATCH 07/24] fix: rename inference-api.py to support python naming --- docker/presets/tfs-onnx/Dockerfile | 2 +- docker/presets/tfs/Dockerfile | 2 +- pkg/inference/preset-inferences.go | 2 +- pkg/inference/preset-inferences_test.go | 4 +- .../{inference-api.py => inference_api.py} | 0 .../{inference-api.py => inference_api.py} | 0 .../{inference-api.py => inference_api.py} | 0 .../text-generation/test_inference_api.py | 84 +++++++++++++++++++ .../falcon-40b-instruct.yaml | 2 +- .../test/manifests/falcon-40b/falcon-40b.yaml | 2 +- .../falcon-7b-instruct.yaml | 2 +- .../test/manifests/falcon-7b/falcon-7b.yaml | 2 +- .../llama-2-13b-chat/llama-2-13b-chat.yaml | 2 +- .../manifests/llama-2-13b/llama-2-13b.yaml | 2 +- .../llama-2-7b-chat/llama-2-7b-chat.yaml | 2 +- .../test/manifests/llama-2-7b/llama-2-7b.yaml | 2 +- .../mistral-7b-instruct.yaml | 2 +- .../test/manifests/mistral-7b/mistral-7b.yaml | 2 +- presets/test/manifests/phi-2/phi-2.yaml | 2 +- 19 files changed, 100 insertions(+), 16 deletions(-) rename presets/inference/llama2-chat/{inference-api.py => inference_api.py} (100%) rename presets/inference/llama2-completion/{inference-api.py => inference_api.py} (100%) rename presets/inference/text-generation/{inference-api.py => inference_api.py} (100%) create mode 100644 presets/inference/text-generation/test_inference_api.py diff --git a/docker/presets/tfs-onnx/Dockerfile 
b/docker/presets/tfs-onnx/Dockerfile index 26b92f8ff..12e788346 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -17,7 +17,7 @@ RUN echo $VERSION > /workspace/tfs/version.txt COPY kaito/presets/inference/${MODEL_TYPE}/requirements.txt /workspace/tfs/requirements.txt RUN pip install --no-cache-dir -r requirements.txt -COPY kaito/presets/inference/${MODEL_TYPE}/inference-api.py /workspace/tfs/inference-api.py +COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py # Convert to ONNX Runtime # RUN python convert_to_onnx.py ${MODEL_NAME} diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index e5826027b..9ddcf9b93 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -16,7 +16,7 @@ RUN echo $VERSION > /workspace/tfs/version.txt COPY kaito/presets/inference/${MODEL_TYPE}/requirements.txt /workspace/tfs/requirements.txt RUN pip install --no-cache-dir -r requirements.txt -COPY kaito/presets/inference/${MODEL_TYPE}/inference-api.py /workspace/tfs/inference-api.py +COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py # Copy the entire model weights to the weights directory COPY ${WEIGHTS_PATH} /workspace/tfs/weights diff --git a/pkg/inference/preset-inferences.go b/pkg/inference/preset-inferences.go index 7d49bfece..9b02012b7 100644 --- a/pkg/inference/preset-inferences.go +++ b/pkg/inference/preset-inferences.go @@ -21,7 +21,7 @@ import ( const ( ProbePath = "/healthz" Port5000 = int32(5000) - InferenceFile = "inference-api.py" + InferenceFile = "inference_api.py" DefaultVolumeMountPath = "/dev/shm" ) diff --git a/pkg/inference/preset-inferences_test.go b/pkg/inference/preset-inferences_test.go index bb73b7894..cd8df067c 100644 --- a/pkg/inference/preset-inferences_test.go +++ b/pkg/inference/preset-inferences_test.go @@ -37,7 +37,7 @@ func TestCreatePresetInference(t *testing.T) { workload: "Deployment", // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams // So expected cmd consists of shell command and inference file - expectedCmd: "/bin/sh -c inference-api.py", + expectedCmd: "/bin/sh -c inference_api.py", }, "test-distributed-model": { @@ -48,7 +48,7 @@ func TestCreatePresetInference(t *testing.T) { c.On("Create", mock.IsType(context.Background()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) }, workload: "StatefulSet", - expectedCmd: "/bin/sh -c inference-api.py", + expectedCmd: "/bin/sh -c inference_api.py", }, } diff --git a/presets/inference/llama2-chat/inference-api.py b/presets/inference/llama2-chat/inference_api.py similarity index 100% rename from presets/inference/llama2-chat/inference-api.py rename to presets/inference/llama2-chat/inference_api.py diff --git a/presets/inference/llama2-completion/inference-api.py b/presets/inference/llama2-completion/inference_api.py similarity index 100% rename from presets/inference/llama2-completion/inference-api.py rename to presets/inference/llama2-completion/inference_api.py diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference_api.py similarity index 100% rename from presets/inference/text-generation/inference-api.py rename to presets/inference/text-generation/inference_api.py diff --git a/presets/inference/text-generation/test_inference_api.py b/presets/inference/text-generation/test_inference_api.py new file mode 100644 index 000000000..81561d1b7 --- /dev/null +++ 
b/presets/inference/text-generation/test_inference_api.py @@ -0,0 +1,84 @@ +from fastapi import FastAPI +from fastapi.testclient import TestClient +from inference_api import app + +client = TestClient(app) + +# Non-Inference Endpoints +def test_read_main(): + response = client.get("/") + assert response.status_code == 200 + assert response.json() == "Server is running" + +def test_health_check(): + response = client.get("/healthz") + # Assume we have a GPU available and the model & pipeline initialized for testing + assert response.status_code == 200 + assert response.json() == {"status": "Healthy"} + +def test_get_metrics(): + response = client.get("/metrics") + assert response.status_code == 200 + # Check the structure of the response to ensure GPU metrics are returned + assert "gpu_info" in response.json() + +# Inference Endpoint +def test_text_generation(): + request_data = { + "prompt": "Hello, world!", + "return_full_text": True, + "clean_up_tokenization_spaces": False, + "generate_kwargs": {"max_length": 50, "min_length": 10} # Example generate_kwargs + } + response = client.post("/chat", json=request_data) + assert response.status_code == 200 + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the result text is not empty + +def test_conversational(): + messages = [ + {"role": "user", "content": "What is your favourite condiment?"}, + {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"}, + {"role": "user", "content": "Do you have mayonnaise recipes?"} + ] + request_data = { + "messages": messages, + "generate_kwargs": {"max_length": 50} # Example generate_kwargs for conversational + } + response = client.post("/chat", json=request_data) + assert response.status_code == 200 + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the conversation result is not empty + +# Invalid tests +def test_invalid_pipeline(): + request_data = { + "prompt": "This should fail", + "pipeline": "invalid-pipeline" # Invalid pipeline type + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response + assert "Invalid pipeline type" in response.json().get("detail", "") + +def test_missing_prompt(): + request_data = { + # "prompt" is missing + "return_full_text": True, + "clean_up_tokenization_spaces": False + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing prompt + assert "Text generation parameter prompt required" in response.json().get("detail", "") + +def test_missing_messages_for_conversation(): + request_data = { + # "messages" is missing for conversational pipeline + "pipeline": "conversational" + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing messages + assert "Conversational parameter messages required" in response.json().get("detail", "") + + diff --git a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml index bd6280b9f..04c49b6ce 100644 --- a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml +++ b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - 
accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 4 # Requesting 4 GPUs diff --git a/presets/test/manifests/falcon-40b/falcon-40b.yaml b/presets/test/manifests/falcon-40b/falcon-40b.yaml index a125d838d..96610a9fb 100644 --- a/presets/test/manifests/falcon-40b/falcon-40b.yaml +++ b/presets/test/manifests/falcon-40b/falcon-40b.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 4 # Requesting 4 GPUs diff --git a/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml b/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml index ed8913e76..d742ee08b 100644 --- a/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml +++ b/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 diff --git a/presets/test/manifests/falcon-7b/falcon-7b.yaml b/presets/test/manifests/falcon-7b/falcon-7b.yaml index 2f1aff077..ed86043e7 100644 --- a/presets/test/manifests/falcon-7b/falcon-7b.yaml +++ b/presets/test/manifests/falcon-7b/falcon-7b.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 diff --git a/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml b/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml index 2cf9867e7..7ab7dcb21 100644 --- a/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml +++ b/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml @@ -35,7 +35,7 @@ spec: - | echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$') - cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference-api.py + cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py resources: limits: nvidia.com/gpu: "1" diff --git a/presets/test/manifests/llama-2-13b/llama-2-13b.yaml b/presets/test/manifests/llama-2-13b/llama-2-13b.yaml index b3d885ce7..46c609bbb 100644 --- a/presets/test/manifests/llama-2-13b/llama-2-13b.yaml +++ b/presets/test/manifests/llama-2-13b/llama-2-13b.yaml @@ -35,7 +35,7 @@ spec: - | echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep 
-o '[^-]*$') - cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference-api.py + cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference_api.py resources: limits: nvidia.com/gpu: "1" diff --git a/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml b/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml index b38cbfe3b..f26b003a8 100644 --- a/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml +++ b/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml @@ -19,7 +19,7 @@ spec: command: - /bin/sh - -c - - cd /workspace/llama/llama-2 && torchrun inference-api.py + - cd /workspace/llama/llama-2 && torchrun inference_api.py resources: limits: nvidia.com/gpu: "1" diff --git a/presets/test/manifests/llama-2-7b/llama-2-7b.yaml b/presets/test/manifests/llama-2-7b/llama-2-7b.yaml index 3e973bcd0..f68d43c64 100644 --- a/presets/test/manifests/llama-2-7b/llama-2-7b.yaml +++ b/presets/test/manifests/llama-2-7b/llama-2-7b.yaml @@ -19,7 +19,7 @@ spec: command: - /bin/sh - -c - - cd /workspace/llama/llama-2 && torchrun inference-api.py + - cd /workspace/llama/llama-2 && torchrun inference_api.py resources: limits: nvidia.com/gpu: "1" diff --git a/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml b/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml index cacfbd484..e998ce0ed 100644 --- a/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml +++ b/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 diff --git a/presets/test/manifests/mistral-7b/mistral-7b.yaml b/presets/test/manifests/mistral-7b/mistral-7b.yaml index 287d435a7..5521ef2f8 100644 --- a/presets/test/manifests/mistral-7b/mistral-7b.yaml +++ b/presets/test/manifests/mistral-7b/mistral-7b.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 diff --git a/presets/test/manifests/phi-2/phi-2.yaml b/presets/test/manifests/phi-2/phi-2.yaml index b250d6248..9882b80cb 100644 --- a/presets/test/manifests/phi-2/phi-2.yaml +++ b/presets/test/manifests/phi-2/phi-2.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 From 8dd38ac0e1661aeb1d1d2a17673fcccd7e99894d Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 12 Feb 2024 23:56:01 -0800 Subject: [PATCH 08/24] fix: add tests --- 
.../text-generation/inference_api.py | 4 +- .../text-generation/tests/run_tests.sh | 10 ++ .../tests/test_inference_api.py | 125 ++++++++++++++++++ 3 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 presets/inference/text-generation/tests/run_tests.sh create mode 100644 presets/inference/text-generation/tests/test_inference_api.py diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index 764d135aa..73c7b5095 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -160,11 +160,11 @@ class UnifiedRequestModel(BaseModel): generate_kwargs: Optional[GenerateKwargs] = Field(None, description="Additional kwargs for generate method") # Field for conversational model - messages: Optional[List[dict]] = Field(None, description="Messages for conversational model") + messages: Optional[List[Dict[str, str]]] = Field(None, description="Messages for conversational model") @app.post("/chat") def generate_text(request_model: UnifiedRequestModel): - user_generate_kwargs = request_model.generate_kwargs or {} + user_generate_kwargs = request_model.generate_kwargs.dict() if request_model.generate_kwargs else {} generate_kwargs = {**default_generate_config, **user_generate_kwargs} if args.pipeline == "text-generation": diff --git a/presets/inference/text-generation/tests/run_tests.sh b/presets/inference/text-generation/tests/run_tests.sh new file mode 100644 index 000000000..6b4104e38 --- /dev/null +++ b/presets/inference/text-generation/tests/run_tests.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +echo "Running text-generation tests..." +python3 test_inference_api.py --pipeline text-generation --pretrained_model_name_or_path microsoft/phi-2 --allow_remote_files True + +echo "Running conversational tests..." +python3 test_inference_api.py --pipeline conversational --pretrained_model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 --allow_remote_files True + +echo "Running invalid-pipeline tests..." +python3 test_inference_api.py --pipeline invalid-pipeline --pretrained_model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 --allow_remote_files True diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py new file mode 100644 index 000000000..4d752a831 --- /dev/null +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -0,0 +1,125 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+import sys +from pathlib import Path + +# Get the parent directory of the current file +parent_dir = str(Path(__file__).resolve().parent.parent) +# Add the parent directory to sys.path +sys.path.append(parent_dir) + +import argparse +from unittest.mock import patch + +from fastapi.testclient import TestClient + +# Parse the command-line arguments +parser = argparse.ArgumentParser() +parser.add_argument("--pipeline", required=True, help="Pipeline type") +parser.add_argument("--pretrained_model_name_or_path", required=True, help="Model path") +parser.add_argument("--allow_remote_files", default=True, help="Allow models to be downloaded for tests") +args = parser.parse_args() +pipeline_type = args.pipeline + +try: + from inference_api import ModelConfig, app +except ValueError as e: + if pipeline_type not in {"text-generation", "conversational"}: + # Pipeline is invalid, handle and exit + print(f"Correctly caught invalid pipeline during import") + sys.exit(0) + else: + raise +except Exception as e: + # For all other exceptions, re-raise + raise + +def run_tests(): + client = TestClient(app) + test_read_main(client) + test_health_check(client) + test_get_metrics(client) + test_get_metrics_no_gpus(client) + # Pipeline must be valid to pass import + if pipeline_type == "text-generation": + test_text_generation(client) + test_missing_prompt(client) + elif pipeline_type == "conversational": + test_conversational(client) + test_missing_messages_for_conversation(client) + +def test_read_main(client): + response = client.get("/") + server_msg, status_code = response.json() + assert server_msg == "Server is running" + assert status_code == 200 + +def test_health_check(client): + response = client.get("/healthz") + # Assume we have a GPU available and the model & pipeline initialized for testing + assert response.status_code == 200 + assert response.json() == {"status": "Healthy"} + +def test_get_metrics(client): + response = client.get("/metrics") + assert response.status_code == 200 + # Check the structure of the response to ensure GPU metrics are returned + assert "gpu_info" in response.json() + +def test_get_metrics_no_gpus(client): + with patch('GPUtil.getGPUs', return_value=[]) as mock_getGPUs: + response = client.get("/metrics") + assert response.status_code == 200 + assert response.json()["gpu_info"] == [] # Expecting an empty list + +def test_text_generation(client): + request_data = { + "prompt": "Hello, world!", + "return_full_text": True, + "clean_up_tokenization_spaces": False, + "generate_kwargs": {"max_length": 50, "min_length": 10} # Example generate_kwargs + } + response = client.post("/chat", json=request_data) + assert response.status_code == 200 + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the result text is not empty + +def test_missing_prompt(client): + request_data = { + # "prompt" is missing + "return_full_text": True, + "clean_up_tokenization_spaces": False, + "generate_kwargs": {"max_length": 50} + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing prompt + assert "Text generation parameter prompt required" in response.json().get("detail", "") + +def test_conversational(client): + messages = [ + {"role": "user", "content": "What is your favourite condiment?"}, + {"role": "assistant", "content": "Well, Im quite partial to a good squeeze of fresh lemon juice. 
It adds just the right amount of zesty flavour to whatever Im cooking up in the kitchen!"}, + {"role": "user", "content": "Do you have mayonnaise recipes?"} + ] + request_data = { + "messages": messages, + "generate_kwargs": {"max_new_tokens": 1000, "do_sample": True} + } + response = client.post("/chat", json=request_data) + + assert response.status_code == 200 + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the conversation result is not empty + +def test_missing_messages_for_conversation(client): + request_data = { + # "messages" is missing for conversational pipeline + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing messages + assert "Conversational parameter messages required" in response.json().get("detail", "") + +if __name__ == "__main__": + run_tests() \ No newline at end of file From b97712dec83a4eee4c00390a85c4eaa1cc5460b8 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 12 Feb 2024 23:57:16 -0800 Subject: [PATCH 09/24] fix: remove --- .../text-generation/test_inference_api.py | 84 ------------------- 1 file changed, 84 deletions(-) delete mode 100644 presets/inference/text-generation/test_inference_api.py diff --git a/presets/inference/text-generation/test_inference_api.py b/presets/inference/text-generation/test_inference_api.py deleted file mode 100644 index 81561d1b7..000000000 --- a/presets/inference/text-generation/test_inference_api.py +++ /dev/null @@ -1,84 +0,0 @@ -from fastapi import FastAPI -from fastapi.testclient import TestClient -from inference_api import app - -client = TestClient(app) - -# Non-Inference Endpoints -def test_read_main(): - response = client.get("/") - assert response.status_code == 200 - assert response.json() == "Server is running" - -def test_health_check(): - response = client.get("/healthz") - # Assume we have a GPU available and the model & pipeline initialized for testing - assert response.status_code == 200 - assert response.json() == {"status": "Healthy"} - -def test_get_metrics(): - response = client.get("/metrics") - assert response.status_code == 200 - # Check the structure of the response to ensure GPU metrics are returned - assert "gpu_info" in response.json() - -# Inference Endpoint -def test_text_generation(): - request_data = { - "prompt": "Hello, world!", - "return_full_text": True, - "clean_up_tokenization_spaces": False, - "generate_kwargs": {"max_length": 50, "min_length": 10} # Example generate_kwargs - } - response = client.post("/chat", json=request_data) - assert response.status_code == 200 - data = response.json() - assert "Result" in data - assert len(data["Result"]) > 0 # Check if the result text is not empty - -def test_conversational(): - messages = [ - {"role": "user", "content": "What is your favourite condiment?"}, - {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. 
It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"}, - {"role": "user", "content": "Do you have mayonnaise recipes?"} - ] - request_data = { - "messages": messages, - "generate_kwargs": {"max_length": 50} # Example generate_kwargs for conversational - } - response = client.post("/chat", json=request_data) - assert response.status_code == 200 - data = response.json() - assert "Result" in data - assert len(data["Result"]) > 0 # Check if the conversation result is not empty - -# Invalid tests -def test_invalid_pipeline(): - request_data = { - "prompt": "This should fail", - "pipeline": "invalid-pipeline" # Invalid pipeline type - } - response = client.post("/chat", json=request_data) - assert response.status_code == 400 # Expecting a Bad Request response - assert "Invalid pipeline type" in response.json().get("detail", "") - -def test_missing_prompt(): - request_data = { - # "prompt" is missing - "return_full_text": True, - "clean_up_tokenization_spaces": False - } - response = client.post("/chat", json=request_data) - assert response.status_code == 400 # Expecting a Bad Request response due to missing prompt - assert "Text generation parameter prompt required" in response.json().get("detail", "") - -def test_missing_messages_for_conversation(): - request_data = { - # "messages" is missing for conversational pipeline - "pipeline": "conversational" - } - response = client.post("/chat", json=request_data) - assert response.status_code == 400 # Expecting a Bad Request response due to missing messages - assert "Conversational parameter messages required" in response.json().get("detail", "") - - From 82bdb9715fef17fbaa54e74a6a1de6a5cf3b08cd Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 15:20:41 -0800 Subject: [PATCH 10/24] fix: added comprehensive tests --- .../text-generation/requirements.txt | 5 +- .../text-generation/tests/run_tests.sh | 10 -- .../tests/test_inference_api.py | 170 +++++++++--------- .../tests/test_model_config.py | 89 +++++++++ 4 files changed, 178 insertions(+), 96 deletions(-) delete mode 100644 presets/inference/text-generation/tests/run_tests.sh create mode 100644 presets/inference/text-generation/tests/test_model_config.py diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 3bbbb3ab9..46e964b23 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -7,4 +7,7 @@ pydantic==1.10.9 uvicorn[standard]==0.23.2 bitsandbytes==0.42.0 deepspeed==0.11.1 -gputil==1.4.0 \ No newline at end of file +gputil==1.4.0 +# For UTs +pytest=8.0.0 +httpx=0.26.0 \ No newline at end of file diff --git a/presets/inference/text-generation/tests/run_tests.sh b/presets/inference/text-generation/tests/run_tests.sh deleted file mode 100644 index 6b4104e38..000000000 --- a/presets/inference/text-generation/tests/run_tests.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -echo "Running text-generation tests..." -python3 test_inference_api.py --pipeline text-generation --pretrained_model_name_or_path microsoft/phi-2 --allow_remote_files True - -echo "Running conversational tests..." -python3 test_inference_api.py --pipeline conversational --pretrained_model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 --allow_remote_files True - -echo "Running invalid-pipeline tests..." 
-python3 test_inference_api.py --pipeline invalid-pipeline --pretrained_model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 --allow_remote_files True diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py index 4d752a831..d25a67ed3 100644 --- a/presets/inference/text-generation/tests/test_inference_api.py +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -1,78 +1,76 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +import importlib import sys from pathlib import Path +from unittest.mock import patch + +import pytest +from fastapi.testclient import TestClient # Get the parent directory of the current file parent_dir = str(Path(__file__).resolve().parent.parent) # Add the parent directory to sys.path sys.path.append(parent_dir) -import argparse -from unittest.mock import patch +@pytest.fixture(params=[ + {"pipeline": "text-generation", "model_path": "microsoft/phi-2", "torch_dtype": "bfloat16"}, + {"pipeline": "conversational", "model_path": "mistralai/Mistral-7B-Instruct-v0.2", "torch_dtype": "bfloat16"}, +]) +def configured_app(request): + original_argv = sys.argv.copy() + # Use request.param to set correct test arguments for each configuration + test_args = [ + 'program_name', + '--pipeline', request.param['pipeline'], + '--pretrained_model_name_or_path', request.param['model_path'], + '--torch_dtype', request.param['torch_dtype'] + ] + sys.argv = test_args -from fastapi.testclient import TestClient + import inference_api + importlib.reload(inference_api) # Reload to prevent module caching + from inference_api import app -# Parse the command-line arguments -parser = argparse.ArgumentParser() -parser.add_argument("--pipeline", required=True, help="Pipeline type") -parser.add_argument("--pretrained_model_name_or_path", required=True, help="Model path") -parser.add_argument("--allow_remote_files", default=True, help="Allow models to be downloaded for tests") -args = parser.parse_args() -pipeline_type = args.pipeline + # Attach the request params to the app instance for access in tests + app.test_config = request.param + yield app -try: - from inference_api import ModelConfig, app -except ValueError as e: - if pipeline_type not in {"text-generation", "conversational"}: - # Pipeline is invalid, handle and exit - print(f"Correctly caught invalid pipeline during import") - sys.exit(0) - else: - raise -except Exception as e: - # For all other exceptions, re-raise - raise - -def run_tests(): - client = TestClient(app) - test_read_main(client) - test_health_check(client) - test_get_metrics(client) - test_get_metrics_no_gpus(client) - # Pipeline must be valid to pass import - if pipeline_type == "text-generation": - test_text_generation(client) - test_missing_prompt(client) - elif pipeline_type == "conversational": - test_conversational(client) - test_missing_messages_for_conversation(client) - -def test_read_main(client): - response = client.get("/") - server_msg, status_code = response.json() - assert server_msg == "Server is running" - assert status_code == 200 - -def test_health_check(client): - response = client.get("/healthz") - # Assume we have a GPU available and the model & pipeline initialized for testing - assert response.status_code == 200 - assert response.json() == {"status": "Healthy"} + sys.argv = original_argv + +def test_conversational(configured_app): + if configured_app.test_config['pipeline'] != 'conversational': + pytest.skip("Skipping 
non-conversational tests") + client = TestClient(configured_app) + messages = [ + {"role": "user", "content": "What is your favourite condiment?"}, + {"role": "assistant", "content": "Well, Im quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever Im cooking up in the kitchen!"}, + {"role": "user", "content": "Do you have mayonnaise recipes?"} + ] + request_data = { + "messages": messages, + "generate_kwargs": {"max_new_tokens": 20, "do_sample": True} + } + response = client.post("/chat", json=request_data) -def test_get_metrics(client): - response = client.get("/metrics") assert response.status_code == 200 - # Check the structure of the response to ensure GPU metrics are returned - assert "gpu_info" in response.json() + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the conversation result is not empty -def test_get_metrics_no_gpus(client): - with patch('GPUtil.getGPUs', return_value=[]) as mock_getGPUs: - response = client.get("/metrics") - assert response.status_code == 200 - assert response.json()["gpu_info"] == [] # Expecting an empty list +def test_missing_messages_for_conversation(configured_app): + if configured_app.test_config['pipeline'] != 'conversational': + pytest.skip("Skipping non-conversational tests") + client = TestClient(configured_app) + request_data = { + # "messages" is missing for conversational pipeline + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing messages + assert "Conversational parameter messages required" in response.json().get("detail", "") -def test_text_generation(client): +def test_text_generation(configured_app): + if configured_app.test_config['pipeline'] != 'text-generation': + pytest.skip("Skipping non-text-generation tests") + client = TestClient(configured_app) request_data = { "prompt": "Hello, world!", "return_full_text": True, @@ -85,7 +83,10 @@ def test_text_generation(client): assert "Result" in data assert len(data["Result"]) > 0 # Check if the result text is not empty -def test_missing_prompt(client): +def test_missing_prompt(configured_app): + if configured_app.test_config['pipeline'] != 'text-generation': + pytest.skip("Skipping non-text-generation tests") + client = TestClient(configured_app) request_data = { # "prompt" is missing "return_full_text": True, @@ -95,31 +96,30 @@ def test_missing_prompt(client): response = client.post("/chat", json=request_data) assert response.status_code == 400 # Expecting a Bad Request response due to missing prompt assert "Text generation parameter prompt required" in response.json().get("detail", "") - -def test_conversational(client): - messages = [ - {"role": "user", "content": "What is your favourite condiment?"}, - {"role": "assistant", "content": "Well, Im quite partial to a good squeeze of fresh lemon juice. 
It adds just the right amount of zesty flavour to whatever Im cooking up in the kitchen!"}, - {"role": "user", "content": "Do you have mayonnaise recipes?"} - ] - request_data = { - "messages": messages, - "generate_kwargs": {"max_new_tokens": 1000, "do_sample": True} - } - response = client.post("/chat", json=request_data) +def test_read_main(configured_app): + client = TestClient(configured_app) + response = client.get("/") + server_msg, status_code = response.json() + assert server_msg == "Server is running" + assert status_code == 200 + +def test_health_check(configured_app): + client = TestClient(configured_app) + response = client.get("/healthz") + # Assume we have a GPU available and the model & pipeline initialized for testing assert response.status_code == 200 - data = response.json() - assert "Result" in data - assert len(data["Result"]) > 0 # Check if the conversation result is not empty + assert response.json() == {"status": "Healthy"} -def test_missing_messages_for_conversation(client): - request_data = { - # "messages" is missing for conversational pipeline - } - response = client.post("/chat", json=request_data) - assert response.status_code == 400 # Expecting a Bad Request response due to missing messages - assert "Conversational parameter messages required" in response.json().get("detail", "") +def test_get_metrics(configured_app): + client = TestClient(configured_app) + response = client.get("/metrics") + assert response.status_code == 200 + assert "gpu_info" in response.json() -if __name__ == "__main__": - run_tests() \ No newline at end of file +def test_get_metrics_no_gpus(configured_app): + client = TestClient(configured_app) + with patch('GPUtil.getGPUs', return_value=[]) as mock_getGPUs: + response = client.get("/metrics") + assert response.status_code == 200 + assert response.json()["gpu_info"] == [] diff --git a/presets/inference/text-generation/tests/test_model_config.py b/presets/inference/text-generation/tests/test_model_config.py new file mode 100644 index 000000000..d0662ba0e --- /dev/null +++ b/presets/inference/text-generation/tests/test_model_config.py @@ -0,0 +1,89 @@ +import importlib +import sys +from pathlib import Path + +import pytest + +# Get the parent directory of the current file +parent_dir = str(Path(__file__).resolve().parent.parent) +# Add the parent directory to sys.path +sys.path.append(parent_dir) + +@pytest.fixture(params=[ + {"pipeline": "text-generation", "model_path": "microsoft/phi-2", "torch_dtype": "bfloat16"}, + {"pipeline": "conversational", "model_path": "mistralai/Mistral-7B-Instruct-v0.2", "torch_dtype": "bfloat16"}, +]) +def configured_model_config(request): + original_argv = sys.argv.copy() + + sys.argv = [ + 'program_name', + '--pipeline', request.param['pipeline'], + '--pretrained_model_name_or_path', request.param['model_path'], + '--torch_dtype', request.param['torch_dtype'] + ] + + import inference_api + importlib.reload(inference_api) + from inference_api import ModelConfig + + # Create and configure the ModelConfig instance + model_config = ModelConfig( + pipeline=request.param['pipeline'], + pretrained_model_name_or_path=request.param['model_path'], + torch_dtype=request.param['torch_dtype'] + ) + + yield model_config + + # Restore the original sys.argv after the test is done + sys.argv = original_argv + +def test_process_additional_args(configured_model_config): + config = configured_model_config + + # Simulate additional command-line arguments + additional_args = [ + "--new_arg1", "value1", + "--new_arg2", + 
"--new_arg3", "value3", + "--flag_arg" + ] + + # Process the additional arguments + config.process_additional_args(additional_args) + + # Assertions to verify that additional arguments were processed correctly + assert getattr(config, "new_arg1", None) == "value1" + assert getattr(config, "new_arg2", None) is True + assert getattr(config, "new_arg3", None) == "value3" + assert getattr(config, "flag_arg", None) is True + +# Test case for ignoring arguments prefixed with '--' when expecting a value +def test_ignore_double_dash_arguments(configured_model_config): + config = configured_model_config + additional_args = [ + "--new_arg1", "--new_arg2", + "--new_arg3", "correct_value" + ] + + config.process_additional_args(additional_args) + + # new_arg1 should be set to True since its value is incorrectly prefixed with '--' + assert getattr(config, "new_arg1", None) is True + assert getattr(config, "new_arg2", None) is True + assert getattr(config, "new_arg3", None) == "correct_value" + +# Test case to verify handling unsupported pipeline values +def test_unsupported_pipeline_raises_value_error(configured_model_config): + with pytest.raises(ValueError) as excinfo: + from inference_api import ModelConfig + ModelConfig(pipeline="unsupported_pipeline") + assert "Unsupported pipeline" in str(excinfo.value) + +# Test case for validating torch_dtype +def test_invalid_torch_dtype_raises_value_error(configured_model_config): + with pytest.raises(ValueError) as excinfo: + from inference_api import ModelConfig + ModelConfig(pipeline="text-generation", torch_dtype="unsupported_dtype") + assert "Invalid torch dtype" in str(excinfo.value) \ No newline at end of file From fcb3f7d63f0fa24b2c1c33ce77775f70bf099bcf Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 15:38:53 -0800 Subject: [PATCH 11/24] fix: typo --- presets/inference/text-generation/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 46e964b23..6cfc08e21 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -9,5 +9,5 @@ bitsandbytes==0.42.0 deepspeed==0.11.1 gputil==1.4.0 # For UTs -pytest=8.0.0 -httpx=0.26.0 \ No newline at end of file +pytest==8.0.0 +httpx==0.26.0 \ No newline at end of file From fb3e7c24c5b9149e02d2420b5d4a3d10edd732c1 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 16:18:49 -0800 Subject: [PATCH 12/24] fix: add git install --- docker/presets/llama-2/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 5d888cc29..c065f59e2 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -7,6 +7,12 @@ FROM python:3.8-slim WORKDIR /workspace +# Install git +RUN apt-get update && \ + apt-get install -y git && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + RUN git clone https://github.com/facebookresearch/llama WORKDIR /workspace/llama From 7758ba01f8d259e1c68579ab1ac9e60414beed76 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 19:28:14 -0800 Subject: [PATCH 13/24] fix: sed using bash --- docker/presets/llama-2/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index c065f59e2..9b7f3d394 100644 --- a/docker/presets/llama-2/Dockerfile +++ 
b/docker/presets/llama-2/Dockerfile @@ -17,7 +17,8 @@ RUN git clone https://github.com/facebookresearch/llama WORKDIR /workspace/llama -RUN sed -i $'/torch.distributed.init_process_group("nccl")/c\\ import datetime\\\n torch.distributed.init_process_group("nccl", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py +# RUN sed -i $'/torch.distributed.init_process_group("nccl")/c\\ import datetime\\\n torch.distributed.init_process_group("nccl", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py +RUN ["/bin/bash", "-c", "sed -i $'/torch.distributed.init_process_group(\"nccl\")/c\\ import datetime\\\n torch.distributed.init_process_group(\"nccl\", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py"] RUN pip install -e . RUN pip install torch==2.2.0 fastapi==0.103.2 pydantic==1.10.9 gputil==1.4.0 From 77c7ed8b082185aa92d9f8cbabf88bdc8c89b7e0 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 20:49:58 -0800 Subject: [PATCH 14/24] fix: add unit tests for cpp --- Makefile | 2 ++ .../text-generation/tests/test_inference_api.py | 12 ++++++++---- .../text-generation/tests/test_model_config.py | 7 +++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index bd92718b1..fd9c5ca54 100644 --- a/Makefile +++ b/Makefile @@ -82,6 +82,8 @@ fmt: ## Run go fmt against code. unit-test: ## Run unit tests. go test -v $(shell go list ./pkg/... ./api/... | grep -v /vendor) -race -coverprofile=coverage.txt -covermode=atomic go tool cover -func=coverage.txt + pip install -r presets/inference/text-generation/requirements.txt + pytest -o log_cli=true -o log_cli_level=INFO . $(E2E_TEST): (cd test/e2e && go test -c . -o $(E2E_TEST)) diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py index d25a67ed3..ff9866bbb 100644 --- a/presets/inference/text-generation/tests/test_inference_api.py +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -4,6 +4,7 @@ from unittest.mock import patch import pytest +import torch from fastapi.testclient import TestClient # Get the parent directory of the current file @@ -12,8 +13,8 @@ sys.path.append(parent_dir) @pytest.fixture(params=[ - {"pipeline": "text-generation", "model_path": "microsoft/phi-2", "torch_dtype": "bfloat16"}, - {"pipeline": "conversational", "model_path": "mistralai/Mistral-7B-Instruct-v0.2", "torch_dtype": "bfloat16"}, + {"pipeline": "text-generation", "model_path": "stanford-crfm/alias-gpt2-small-x21"}, + {"pipeline": "conversational", "model_path": "stanford-crfm/alias-gpt2-small-x21"}, ]) def configured_app(request): original_argv = sys.argv.copy() @@ -22,7 +23,7 @@ def configured_app(request): 'program_name', '--pipeline', request.param['pipeline'], '--pretrained_model_name_or_path', request.param['model_path'], - '--torch_dtype', request.param['torch_dtype'] + '--allow_remote_files', 'True' ] sys.argv = test_args @@ -105,9 +106,12 @@ def test_read_main(configured_app): assert status_code == 200 def test_health_check(configured_app): + device = "GPU" if torch.cuda.is_available() else "CPU" + if device != "GPU": + pytest.skip("Skipping healthz endpoint check - running on CPU") client = TestClient(configured_app) response = client.get("/healthz") - # Assume we have a GPU available and the model & pipeline initialized for testing + # Assuming we have a GPU available assert response.status_code == 200 assert response.json() == 
{"status": "Healthy"} diff --git a/presets/inference/text-generation/tests/test_model_config.py b/presets/inference/text-generation/tests/test_model_config.py index d0662ba0e..df5b98e8d 100644 --- a/presets/inference/text-generation/tests/test_model_config.py +++ b/presets/inference/text-generation/tests/test_model_config.py @@ -10,8 +10,8 @@ sys.path.append(parent_dir) @pytest.fixture(params=[ - {"pipeline": "text-generation", "model_path": "microsoft/phi-2", "torch_dtype": "bfloat16"}, - {"pipeline": "conversational", "model_path": "mistralai/Mistral-7B-Instruct-v0.2", "torch_dtype": "bfloat16"}, + {"pipeline": "text-generation", "model_path": "stanford-crfm/alias-gpt2-small-x21"}, + {"pipeline": "conversational", "model_path": "stanford-crfm/alias-gpt2-small-x21"}, ]) def configured_model_config(request): original_argv = sys.argv.copy() @@ -20,7 +20,7 @@ def configured_model_config(request): 'program_name', '--pipeline', request.param['pipeline'], '--pretrained_model_name_or_path', request.param['model_path'], - '--torch_dtype', request.param['torch_dtype'] + '--allow_remote_files', 'True' ] import inference_api @@ -31,7 +31,6 @@ def configured_model_config(request): model_config = ModelConfig( pipeline=request.param['pipeline'], pretrained_model_name_or_path=request.param['model_path'], - torch_dtype=request.param['torch_dtype'] ) yield model_config From 56cacc6a96aa88f8c0339f6304db66863621b0d7 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 21:15:28 -0800 Subject: [PATCH 15/24] fix: need to rebuild models --- presets/models/supported_models.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index d5e4fea1f..f4fa1a87c 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,58 +3,58 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-13b type: llama2-completion runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-70b type: llama2-completion runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 # Falcon - name: falcon-7b type: text-generation version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - tag: 0.0.1 + tag: 0.0.2 - name: falcon-7b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - tag: 0.0.1 + tag: 0.0.2 - name: falcon-40b type: text-generation version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - tag: 0.0.1 + tag: 0.0.2 - name: falcon-40b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - tag: 0.0.1 + tag: 0.0.2 # Mistral - name: mistral-7b type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 runtime: tfs - tag: 0.0.1 + tag: 0.0.2 - name: mistral-7b-instruct type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/commit/9ab9e76e2b09f9f29ea2d56aa5bd139e4445c59e runtime: 
tfs - tag: 0.0.1 + tag: 0.0.2 From cfe9ac7e3341938c93fa79210a4216274213edcd Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Wed, 14 Feb 2024 16:01:04 -0800 Subject: [PATCH 16/24] fix: makefile edit for unit tests --- .github/workflows/e2e-preset-test.yml | 2 +- .github/workflows/preset-image-build.yml | 3 ++- .github/workflows/tests.yml | 4 ++++ Makefile | 2 ++ docker/presets/llama-2/Dockerfile | 1 - 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 641ac2ab1..db96dcec2 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -33,7 +33,7 @@ jobs: - name: Determine Affected Models id: affected_models run: | - PR_BRANCH=${{ github.head_ref }} \ + PR_BRANCH=${{ github.ref_name }} \ python3 .github/workflows/kind-cluster/determine_models.py - name: Print Determined Models diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml index 42ef6c7a6..fd86ac741 100644 --- a/.github/workflows/preset-image-build.yml +++ b/.github/workflows/preset-image-build.yml @@ -118,5 +118,6 @@ jobs: if: ${{ always() }} run: | kubectl get job --no-headers -o custom-columns=":metadata.name" | grep "^docker-build-job-${{ matrix.model.name }}-[0-9]" | xargs -r kubectl delete job - kubectl get pods --no-headers -o custom-columns=":metadata.name" | grep "^docker-build-job-${{ matrix.model.name }}-[0-9]" | xargs -r kubectl delete pod + # Job deletion above deletes associated pods + # kubectl get pods --no-headers -o custom-columns=":metadata.name" | grep "^docker-build-job-${{ matrix.model.name }}-[0-9]" | xargs -r kubectl delete pod diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5b04b414d..29d14ac24 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,6 +38,10 @@ jobs: run: | make unit-test + - name: Run inference api unit tests + run: | + make inference-api-e2e + - name: Upload Codecov report uses: codecov/codecov-action@v4 with: diff --git a/Makefile b/Makefile index fd9c5ca54..1a5445c48 100644 --- a/Makefile +++ b/Makefile @@ -82,6 +82,8 @@ fmt: ## Run go fmt against code. unit-test: ## Run unit tests. go test -v $(shell go list ./pkg/... ./api/... | grep -v /vendor) -race -coverprofile=coverage.txt -covermode=atomic go tool cover -func=coverage.txt + +inference-api-e2e: pip install -r presets/inference/text-generation/requirements.txt pytest -o log_cli=true -o log_cli_level=INFO . diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 9b7f3d394..103afb297 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -17,7 +17,6 @@ RUN git clone https://github.com/facebookresearch/llama WORKDIR /workspace/llama -# RUN sed -i $'/torch.distributed.init_process_group("nccl")/c\\ import datetime\\\n torch.distributed.init_process_group("nccl", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py RUN ["/bin/bash", "-c", "sed -i $'/torch.distributed.init_process_group(\"nccl\")/c\\ import datetime\\\n torch.distributed.init_process_group(\"nccl\", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py"] RUN pip install -e . 
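The Makefile change in the patch above splits the Python tests out of `unit-test` into a dedicated `inference-api-e2e` target (install presets/inference/text-generation/requirements.txt, then run pytest), which is what the new "Run inference api unit tests" step in tests.yml invokes with `make inference-api-e2e`.

The bash-wrapped sed retained in the llama-2 Dockerfile rewrites /workspace/llama/llama/generation.py so the NCCL process group is created with an effectively unlimited timeout, presumably so that slow multi-GPU weight loading does not trip the default timeout. As a sketch, the substituted lines are intended to read as follows (the surrounding function and exact indentation come from the upstream facebookresearch/llama sources and are assumed here):

    # produced by the sed rule above, which replaces the original single-line init call
    import datetime
    torch.distributed.init_process_group("nccl", timeout=datetime.timedelta(days=365*100))
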
From 29ddf2e2e6cf328a2b17dbc839b6e4fd55835b11 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Wed, 14 Feb 2024 16:05:28 -0800 Subject: [PATCH 17/24] nit: add quotes --- .github/workflows/e2e-preset-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index db96dcec2..0e65a5591 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -65,7 +65,7 @@ jobs: # COMBINED_MATRIX.append(combined) # break - COMBINED_MATRIX=$(echo ${{ steps.affected_models.outputs.matrix }} | jq --argjson configs "$CONFIGS" -c ' + COMBINED_MATRIX=$(echo '${{ steps.affected_models.outputs.matrix }}' | jq --argjson configs "$CONFIGS" -c ' map(. as $model | $configs[] | select(.name == $model.name) | $model + .) ') From f05f56582854323c87a0d3122a33deda386ef5d1 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Wed, 14 Feb 2024 17:56:17 -0800 Subject: [PATCH 18/24] fix: pin torch version and increase e2e preset timeout --- .github/workflows/e2e-preset-test.yml | 2 +- docker/presets/llama-2/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 0e65a5591..3003e352f 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -252,7 +252,7 @@ jobs: - name: Wait for Resource to be ready if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' run: | - kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} + kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} --timeout=1800s - name: Test home endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 103afb297..8957ef749 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -20,7 +20,7 @@ WORKDIR /workspace/llama RUN ["/bin/bash", "-c", "sed -i $'/torch.distributed.init_process_group(\"nccl\")/c\\ import datetime\\\n torch.distributed.init_process_group(\"nccl\", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py"] RUN pip install -e . 
-RUN pip install torch==2.2.0 fastapi==0.103.2 pydantic==1.10.9 gputil==1.4.0 +RUN pip install torch==2.1.0 fastapi==0.103.2 pydantic==1.10.9 gputil==1.4.0 RUN pip install 'uvicorn[standard]' ARG WEIGHTS_PATH From 53eccf19d864d5d683c0ef7a22c42e675d3e0f9b Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Sun, 18 Feb 2024 23:09:45 -0800 Subject: [PATCH 19/24] feat: update requirements --- docker/presets/tfs/Dockerfile | 2 +- .../text-generation/requirements.txt | 2 +- presets/models/supported_models.yaml | 21 ++++++++++++------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 9ddcf9b93..5a322b8bd 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.8-slim +FROM python:3.10-slim ARG WEIGHTS_PATH ARG MODEL_TYPE diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 6cfc08e21..8a7c50dbe 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -1,5 +1,5 @@ # Dependencies for TFS -transformers==4.36.0 +transformers==4.37.2 torch==2.2.0 accelerate==0.23.0 fastapi==0.109.1 diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index f4fa1a87c..0ce52b1e7 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,27 +3,27 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-13b type: llama2-completion runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-70b type: llama2-completion runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 # Falcon - name: falcon-7b @@ -55,6 +55,13 @@ models: tag: 0.0.2 - name: mistral-7b-instruct type: text-generation - version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/commit/9ab9e76e2b09f9f29ea2d56aa5bd139e4445c59e + version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs tag: 0.0.2 + + # Phi-2 + - name: phi-2 + type: text-generation + version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 + runtime: tfs + tag: 0.0.1 \ No newline at end of file From f6ec3d3cab2456bcacc0abee6070deebffb48a01 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 19 Feb 2024 14:36:30 -0800 Subject: [PATCH 20/24] fix: include rename in different PR --- pkg/inference/preset-inferences.go | 2 +- pkg/inference/preset-inferences_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/inference/preset-inferences.go b/pkg/inference/preset-inferences.go index 9b02012b7..7d49bfece 100644 --- a/pkg/inference/preset-inferences.go +++ b/pkg/inference/preset-inferences.go @@ -21,7 +21,7 @@ import ( const ( ProbePath = "/healthz" Port5000 = int32(5000) - InferenceFile = "inference_api.py" + InferenceFile = "inference-api.py" DefaultVolumeMountPath = "/dev/shm" ) diff --git a/pkg/inference/preset-inferences_test.go b/pkg/inference/preset-inferences_test.go index cd8df067c..bb73b7894 100644 --- a/pkg/inference/preset-inferences_test.go +++ b/pkg/inference/preset-inferences_test.go @@ -37,7 +37,7 @@ 
func TestCreatePresetInference(t *testing.T) { workload: "Deployment", // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams // So expected cmd consists of shell command and inference file - expectedCmd: "/bin/sh -c inference_api.py", + expectedCmd: "/bin/sh -c inference-api.py", }, "test-distributed-model": { @@ -48,7 +48,7 @@ func TestCreatePresetInference(t *testing.T) { c.On("Create", mock.IsType(context.Background()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) }, workload: "StatefulSet", - expectedCmd: "/bin/sh -c inference_api.py", + expectedCmd: "/bin/sh -c inference-api.py", }, } From 8133e020208f8ec8ea86b79301029a6614b0734a Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 20 Feb 2024 15:54:14 -0800 Subject: [PATCH 21/24] nit: gpu change --- presets/test/manifests/phi-2/phi-2.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presets/test/manifests/phi-2/phi-2.yaml b/presets/test/manifests/phi-2/phi-2.yaml index 9882b80cb..15a7fbde2 100644 --- a/presets/test/manifests/phi-2/phi-2.yaml +++ b/presets/test/manifests/phi-2/phi-2.yaml @@ -21,9 +21,9 @@ spec: - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 limits: - nvidia.com/gpu: 2 # Requesting 2 GPUs + nvidia.com/gpu: 1 # Requesting 1 GPUs livenessProbe: httpGet: path: /healthz From 852776240e1936fafd60b11f8037838eb1071cc3 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 20 Feb 2024 16:17:30 -0800 Subject: [PATCH 22/24] fix: phi-2 needs more memory --- .github/e2e-preset-configs.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json index a499d28d6..e3cbbb8b9 100644 --- a/.github/e2e-preset-configs.json +++ b/.github/e2e-preset-configs.json @@ -41,7 +41,7 @@ "name": "phi-2", "node-count": 1, "node-vm-size": "Standard_NC6s_v3", - "node-osdisk-size": 30 + "node-osdisk-size": 50 }, { "name": "llama-2-7b", From 0e428fd91bd12a731ba24fe6a7d87178e7e5026a Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 20 Feb 2024 22:41:44 -0800 Subject: [PATCH 23/24] fix: increase timeout --- .../test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml | 1 + presets/test/manifests/falcon-40b/falcon-40b.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml index 04c49b6ce..8dd56c6a6 100644 --- a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml +++ b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml @@ -12,6 +12,7 @@ spec: labels: app: falcon spec: + progressDeadlineSeconds: 1800 containers: - name: falcon-container image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE diff --git a/presets/test/manifests/falcon-40b/falcon-40b.yaml b/presets/test/manifests/falcon-40b/falcon-40b.yaml index 96610a9fb..5ca9716a0 100644 --- a/presets/test/manifests/falcon-40b/falcon-40b.yaml +++ b/presets/test/manifests/falcon-40b/falcon-40b.yaml @@ -12,6 +12,7 @@ spec: labels: app: falcon spec: + progressDeadlineSeconds: 1800 containers: - name: falcon-container image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE From e1a6e41b4b20c038f29f0742218ac9318e58b827 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 20 Feb 2024 22:56:46 -0800 Subject: [PATCH 24/24] fix: 
increase timeout --- .../test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml | 2 +- presets/test/manifests/falcon-40b/falcon-40b.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml index 8dd56c6a6..226a485a7 100644 --- a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml +++ b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml @@ -3,6 +3,7 @@ kind: Deployment metadata: name: falcon-40b-instruct spec: + progressDeadlineSeconds: 1800 replicas: 1 selector: matchLabels: @@ -12,7 +13,6 @@ spec: labels: app: falcon spec: - progressDeadlineSeconds: 1800 containers: - name: falcon-container image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE diff --git a/presets/test/manifests/falcon-40b/falcon-40b.yaml b/presets/test/manifests/falcon-40b/falcon-40b.yaml index 5ca9716a0..a4cb2d524 100644 --- a/presets/test/manifests/falcon-40b/falcon-40b.yaml +++ b/presets/test/manifests/falcon-40b/falcon-40b.yaml @@ -3,6 +3,7 @@ kind: Deployment metadata: name: falcon-40b spec: + progressDeadlineSeconds: 1800 replicas: 1 selector: matchLabels: @@ -12,7 +13,6 @@ spec: labels: app: falcon spec: - progressDeadlineSeconds: 1800 containers: - name: falcon-container image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE
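
Taken together, the last two patches introduce `progressDeadlineSeconds: 1800` for the falcon-40b and falcon-40b-instruct manifests and then move it from the pod template's spec up to the Deployment spec, where Kubernetes actually honors it (the field belongs to DeploymentSpec, not PodSpec). A minimal sketch of the intended final shape, with fields taken from the diffs above and container details truncated:

    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: falcon-40b
    spec:
      progressDeadlineSeconds: 1800   # rollout deadline, honored at the Deployment level
      replicas: 1
      selector:
        matchLabels:
          app: falcon
      template:
        metadata:
          labels:
            app: falcon
        spec:                          # pod spec; progressDeadlineSeconds is not valid here
          containers:
          - name: falcon-container
            image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE

The 1800-second deadline matches the `--timeout=1800s` passed to `kubectl rollout status` in the e2e preset workflow, so both the Deployment controller and the test wait up to 30 minutes for the large falcon-40b images to pull and start.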