From 624f47929d2a4d18475fa6366eb807491c251fdb Mon Sep 17 00:00:00 2001 From: Evan Date: Mon, 11 Mar 2024 22:48:32 -0700 Subject: [PATCH 01/15] Initial Dockerfile and fastapi implementation --- docker/presets/llama-2/Dockerfile | 5 ++++- docker/presets/tfs-onnx/Dockerfile | 4 +++- docker/presets/tfs/Dockerfile | 4 +++- presets/inference/llama2-chat/inference_api.py | 7 +++++++ presets/inference/llama2-completion/inference_api.py | 7 +++++++ presets/inference/text-generation/inference_api.py | 7 +++++++ 6 files changed, 31 insertions(+), 3 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 641d158bc..537d501a9 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -26,8 +26,11 @@ RUN pip install 'uvicorn[standard]' ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION -# Write the version to a file +ARG IMAGE_NAME + +# Write metadata to .txt files RUN echo $VERSION > /workspace/llama/version.txt +RUN echo $IMAGE_NAME > /workspace/llama/model_name.txt ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index 12e788346..41ee80f0b 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -4,12 +4,14 @@ FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu118-py38-torch211 ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION +ARG IMAGE_NAME # Set the working directory WORKDIR /workspace/tfs -# Write the version to a file +# Write metadata to .txt files RUN echo $VERSION > /workspace/tfs/version.txt +RUN echo $IMAGE_NAME > /workspace/tfs/model_name.txt # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 5a322b8bd..2bb446342 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -3,12 +3,14 @@ FROM python:3.10-slim ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION +ARG IMAGE_NAME # Set the working directory WORKDIR /workspace/tfs -# Write the version to a file +# Write metadata to .txt files RUN echo $VERSION > /workspace/tfs/version.txt +RUN echo $IMAGE_NAME > /workspace/tfs/model_name.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/inference/llama2-chat/inference_api.py b/presets/inference/llama2-chat/inference_api.py index 11776bf3d..9691db010 100644 --- a/presets/inference/llama2-chat/inference_api.py +++ b/presets/inference/llama2-chat/inference_api.py @@ -191,6 +191,13 @@ def get_metrics(): except Exception as e: return {"error": str(e)} + @app_main.get("/version") + def health_check(): + with open("/workspace/llama/model_name.txt", "r") as f: + model_name = f.read() + + return {"version": model_name} + def setup_worker_routes(): @app_worker.get("/healthz") def health_check(): diff --git a/presets/inference/llama2-completion/inference_api.py b/presets/inference/llama2-completion/inference_api.py index cf500146a..9adb230c1 100644 --- a/presets/inference/llama2-completion/inference_api.py +++ b/presets/inference/llama2-completion/inference_api.py @@ -180,6 +180,13 @@ def get_metrics(): except Exception as e: return {"error": str(e)} + @app_main.get("/version") + def health_check(): + with open("/workspace/tfs/model_name.txt", "r") as f: + 
model_name = f.read() + + return {"version": model_name} + def setup_worker_routes(): @app_worker.get("/healthz") def health_check(): diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index 73c7b5095..ad713d075 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -222,6 +222,13 @@ def get_metrics(): except Exception as e: return {"error": str(e)} +@app.get("/version") +def health_check(): + with open("/workspace/tfs/model_name.txt", "r") as f: + model_name = f.read() + + return {"version": model_name} + if __name__ == "__main__": local_rank = int(os.environ.get("LOCAL_RANK", 0)) # Default to 0 if not set port = 5000 + local_rank # Adjust port based on local rank From 484c0addc7819318fb539ecd7ad7c9b50356702c Mon Sep 17 00:00:00 2001 From: Evan Date: Wed, 13 Mar 2024 13:56:25 -0700 Subject: [PATCH 02/15] Rename, add constants, template update --- .github/workflows/kind-cluster/docker-job-template.yaml | 1 + docker/presets/llama-2/Dockerfile | 5 ++--- docker/presets/tfs-onnx/Dockerfile | 5 ++--- docker/presets/tfs/Dockerfile | 5 ++--- presets/inference/llama2-chat/inference_api.py | 7 +++++-- presets/inference/llama2-completion/inference_api.py | 7 +++++-- presets/inference/text-generation/inference_api.py | 6 ++++-- 7 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/workflows/kind-cluster/docker-job-template.yaml b/.github/workflows/kind-cluster/docker-job-template.yaml index a19860f88..64d86da54 100644 --- a/.github/workflows/kind-cluster/docker-job-template.yaml +++ b/.github/workflows/kind-cluster/docker-job-template.yaml @@ -43,6 +43,7 @@ spec: --build-arg WEIGHTS_PATH=/weights \ --build-arg VERSION={{VERSION}} \ --build-arg MODEL_TYPE={{MODEL_TYPE}} \ + --build-arg IMAGE_NAME={{IMAGE_NAME}} \ -f $DOCKERFILE_PATH / docker push $ACR_NAME.azurecr.io/{{IMAGE_NAME}}:$VERSION env: diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 537d501a9..e26e64043 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -28,9 +28,8 @@ ARG MODEL_TYPE ARG VERSION ARG IMAGE_NAME -# Write metadata to .txt files -RUN echo $VERSION > /workspace/llama/version.txt -RUN echo $IMAGE_NAME > /workspace/llama/model_name.txt +# Write metadata to model_info.txt file +RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/llama/model_info.txt ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index 41ee80f0b..ed39e3555 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -9,9 +9,8 @@ ARG IMAGE_NAME # Set the working directory WORKDIR /workspace/tfs -# Write metadata to .txt files -RUN echo $VERSION > /workspace/tfs/version.txt -RUN echo $IMAGE_NAME > /workspace/tfs/model_name.txt +# Write metadata to model_info.txt file +RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 2bb446342..a8d5e7587 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -8,9 +8,8 @@ ARG IMAGE_NAME # Set the working directory WORKDIR /workspace/tfs -# Write metadata to 
.txt files -RUN echo $VERSION > /workspace/tfs/version.txt -RUN echo $IMAGE_NAME > /workspace/tfs/model_name.txt +# Write metadata to model_info.txt file +RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/inference/llama2-chat/inference_api.py b/presets/inference/llama2-chat/inference_api.py index 9691db010..b3a7a909c 100644 --- a/presets/inference/llama2-chat/inference_api.py +++ b/presets/inference/llama2-chat/inference_api.py @@ -18,6 +18,9 @@ from llama import Llama from pydantic import BaseModel +# Constants +MODEL_INFO = "model_info.txt" + # Setup argparse parser = argparse.ArgumentParser(description="Llama API server.") parser.add_argument("--ckpt_dir", default="weights/", help="Checkpoint directory.") @@ -192,8 +195,8 @@ def get_metrics(): return {"error": str(e)} @app_main.get("/version") - def health_check(): - with open("/workspace/llama/model_name.txt", "r") as f: + def get_version(): + with open(f"/workspace/llama/{MODEL_INFO}", "r") as f: model_name = f.read() return {"version": model_name} diff --git a/presets/inference/llama2-completion/inference_api.py b/presets/inference/llama2-completion/inference_api.py index 9adb230c1..612b4e789 100644 --- a/presets/inference/llama2-completion/inference_api.py +++ b/presets/inference/llama2-completion/inference_api.py @@ -18,6 +18,9 @@ from llama import Llama from pydantic import BaseModel +# Constants +MODEL_INFO = "model_info.txt" + # Setup argparse parser = argparse.ArgumentParser(description="Llama API server.") parser.add_argument("--ckpt_dir", default="weights/", help="Checkpoint directory.") @@ -181,8 +184,8 @@ def get_metrics(): return {"error": str(e)} @app_main.get("/version") - def health_check(): - with open("/workspace/tfs/model_name.txt", "r") as f: + def get_version(): + with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: model_name = f.read() return {"version": model_name} diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index ad713d075..b50a5884e 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -13,6 +13,8 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, GenerationConfig, HfArgumentParser) +# Constants +MODEL_INFO = "model_info.txt" @dataclass class ModelConfig: @@ -223,8 +225,8 @@ def get_metrics(): return {"error": str(e)} @app.get("/version") -def health_check(): - with open("/workspace/tfs/model_name.txt", "r") as f: +def get_version(): + with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: model_name = f.read() return {"version": model_name} From 031735db71cffad287da7505a76508b32c30e8a1 Mon Sep 17 00:00:00 2001 From: Evan Date: Wed, 13 Mar 2024 17:54:12 -0700 Subject: [PATCH 03/15] Fix formatting --- docker/presets/llama-2/Dockerfile | 2 +- docker/presets/tfs-onnx/Dockerfile | 2 +- docker/presets/tfs/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index e26e64043..e5854f805 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -29,7 +29,7 @@ ARG VERSION ARG IMAGE_NAME # Write metadata to model_info.txt file -RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/llama/model_info.txt +RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/llama/model_info.txt 
ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index ed39e3555..a13dabe28 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -10,7 +10,7 @@ ARG IMAGE_NAME WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index a8d5e7587..4f04df4c3 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -9,7 +9,7 @@ ARG IMAGE_NAME WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and From afc70300ae0005485a1167e2add7fc4d588b2c19 Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 14 Mar 2024 13:53:55 -0700 Subject: [PATCH 04/15] Version and endpoint --- .github/workflows/e2e-preset-test.yml | 5 +++++ presets/models/supported_models.yaml | 27 +++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 06dd5ac3b..c22e7d6cc 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -274,6 +274,11 @@ jobs: if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz + + - name: Test version endpoint + if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') + run: | + curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/version - name: Test inference endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 3f8dda09e..11c88cb0f 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,28 +3,29 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-13b type: llama2-completion runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-70b type: llama2-completion runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 # Tag history: + # 0.0.4 - Version endpoint # 0.0.3 - Inference API Cleanup (#233) # 0.0.2 - Eliminate Unnecessary Process Group Creation in Worker Initialization (#244) # 0.0.1 - Initial Release @@ -34,23 +35,24 @@ models: type: text-generation version: 
https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 - name: falcon-7b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 - name: falcon-40b type: text-generation version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 - name: falcon-40b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Version endpoint # 0.0.2 - Inference API Cleanup (#233) # 0.0.1 - Initial Release @@ -59,13 +61,14 @@ models: type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 - name: mistral-7b-instruct type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Version endpoint # 0.0.2 - Inference API Cleanup (#233) # 0.0.1 - Initial Release From e85ee06d947bc82e4a1448fe2f9d0bad25006e4c Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 14 Mar 2024 14:10:42 -0700 Subject: [PATCH 05/15] Resync --- presets/models/falcon/model.go | 8 ++++---- presets/models/mistral/model.go | 12 ++++++------ presets/models/phi/model.go | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/presets/models/falcon/model.go b/presets/models/falcon/model.go index 00dfd0d77..7501dce23 100644 --- a/presets/models/falcon/model.go +++ b/presets/models/falcon/model.go @@ -37,10 +37,10 @@ var ( PresetFalcon40BInstructModel = PresetFalcon40BModel + "-instruct" PresetFalconTagMap = map[string]string{ - "Falcon7B": "0.0.2", - "Falcon7BInstruct": "0.0.2", - "Falcon40B": "0.0.2", - "Falcon40BInstruct": "0.0.2", + "Falcon7B": "0.0.3", + "Falcon7BInstruct": "0.0.3", + "Falcon40B": "0.0.3", + "Falcon40BInstruct": "0.0.3", } baseCommandPresetFalcon = "accelerate launch" diff --git a/presets/models/mistral/model.go b/presets/models/mistral/model.go index 3f1d79d15..7089eafb6 100644 --- a/presets/models/mistral/model.go +++ b/presets/models/mistral/model.go @@ -23,12 +23,12 @@ func init() { } var ( - PresetMistral7BModel = "mistral-7b" - PresetMistral7BInstructModel = PresetMistral7BModel + "-instruct" + PresetMistral7BModel = "mistral-7b" + PresetMistral7BInstructModel = PresetMistral7BModel + "-instruct" PresetMistralTagMap = map[string]string{ - "Mistral7B": "0.0.2", - "Mistral7BInstruct": "0.0.2", + "Mistral7B": "0.0.3", + "Mistral7BInstruct": "0.0.3", } baseCommandPresetMistral = "accelerate launch" @@ -46,7 +46,7 @@ func (*mistral7b) GetInferenceParameters() *model.PresetInferenceParam { return &model.PresetInferenceParam{ ModelFamilyName: "Mistral", ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic), - DiskStorageRequirement: "50Gi", + DiskStorageRequirement: "100Gi", GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Mistral using native vertical model parallel, no per GPU memory requirement. 
@@ -70,7 +70,7 @@ func (*mistral7bInst) GetInferenceParameters() *model.PresetInferenceParam { return &model.PresetInferenceParam{ ModelFamilyName: "Mistral", ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic), - DiskStorageRequirement: "50Gi", + DiskStorageRequirement: "100Gi", GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "0Gi", // We run mistral using native vertical model parallel, no per GPU memory requirement. diff --git a/presets/models/phi/model.go b/presets/models/phi/model.go index e819256b6..2e54dce38 100644 --- a/presets/models/phi/model.go +++ b/presets/models/phi/model.go @@ -19,10 +19,10 @@ func init() { } var ( - PresetPhi2Model = "phi-2" + PresetPhi2Model = "phi-2" PresetPhiTagMap = map[string]string{ - "Phi2": "0.0.1", + "Phi2": "0.0.2", } baseCommandPresetPhi = "accelerate launch" @@ -40,7 +40,7 @@ func (*phi2) GetInferenceParameters() *model.PresetInferenceParam { return &model.PresetInferenceParam{ ModelFamilyName: "Phi", ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic), - DiskStorageRequirement: "30Gi", + DiskStorageRequirement: "50Gi", GPUCountRequirement: "1", TotalGPUMemoryRequirement: "12Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. From f29877754e01a124110f0fc5617aec03a2df88cb Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 14 Mar 2024 14:12:48 -0700 Subject: [PATCH 06/15] Phi2 --- presets/models/supported_models.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 7d85d07c8..97c2a2697 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -79,7 +79,8 @@ models: type: text-generation version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Version endpoint # 0.0.2 - Update Default Params (#294) # 0.0.1 - Initial Release From 478f68b6fdc8a582e5a4edc505d2d7372e31edc3 Mon Sep 17 00:00:00 2001 From: Evan Date: Tue, 19 Mar 2024 20:21:11 -0700 Subject: [PATCH 07/15] Version --- presets/models/supported_models.yaml | 45 ++++++++++++++++------------ 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 97c2a2697..6ac9480d7 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,27 +3,27 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-13b type: llama2-completion runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-70b type: llama2-completion runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Inference API Cleanup (#233) @@ -33,24 +33,28 @@ models: # Falcon - name: falcon-7b type: text-generation - version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 + revisionID: 898df1396f35e447d5fe44e0a3ccaaaa69f30d36 + commitID: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - tag: 
0.0.4 + version: 0.0.4 - name: falcon-7b-instruct type: text-generation - version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 + revisionID: cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 + commitID: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - tag: 0.0.4 + version: 0.0.4 - name: falcon-40b type: text-generation - version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 + revisionID: 4a70170c215b36a3cce4b4253f6d0612bb7d4146 + commitURL: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - tag: 0.0.4 + version: 0.0.4 - name: falcon-40b-instruct type: text-generation - version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f + revisionID: ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f + commitURL: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - tag: 0.0.4 + version: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Update Default Params (#294) @@ -60,14 +64,16 @@ models: # Mistral - name: mistral-7b type: text-generation - version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 + revisionID: 26bca36bde8333b5d7f72e9ed20ccda6a618af24 + commitURL: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 runtime: tfs tag: 0.0.4 - name: mistral-7b-instruct type: text-generation - version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 + revisionID: b70aa86578567ba3301b21c8a27bea4e8f6d6d61 + commitURL: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs - tag: 0.0.4 + version: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Update Default Params (#294) @@ -77,9 +83,10 @@ models: # Phi-2 - name: phi-2 type: text-generation - version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 + revisionID: b10c3eba545ad279e7208ee3a5d644566f001670 + commitURL: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 runtime: tfs - tag: 0.0.3 + version: 0.0.3 # Tag history: # 0.0.3 - Version endpoint # 0.0.2 - Update Default Params (#294) From 60bada85658716104ad7654cbc46cfcbd1df60a3 Mon Sep 17 00:00:00 2001 From: Evan Date: Wed, 27 Mar 2024 13:25:12 -0700 Subject: [PATCH 08/15] Versioning fixes --- docker/presets/llama-2/Dockerfile | 3 ++- docker/presets/tfs/Dockerfile | 2 +- presets/models/supported_models.yaml | 21 +++++++-------------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index e5854f805..c9e10fc40 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -3,6 +3,7 @@ # --build-arg WEIGHTS_PATH=/weights \ # --build-arg VERSION={{VERSION}} \ # --build-arg MODEL_TYPE={{MODEL_TYPE}} \ +# --build-arg IMAGE_NAME={{IMAGE_NAME}} \ FROM python:3.8-slim WORKDIR /workspace @@ -29,7 +30,7 @@ ARG VERSION ARG IMAGE_NAME # Write metadata to model_info.txt file -RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/llama/model_info.txt +RUN echo -e "$MODEL_TYPE\n$VERSION\n$IMAGE_NAME" > /workspace/llama/model_info.txt ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD 
kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 4f04df4c3..c2c0a40f2 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -9,7 +9,7 @@ ARG IMAGE_NAME WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "$MODEL_TYPE\n$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 6ac9480d7..460f5210e 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -33,26 +33,22 @@ models: # Falcon - name: falcon-7b type: text-generation - revisionID: 898df1396f35e447d5fe44e0a3ccaaaa69f30d36 - commitID: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 + version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs version: 0.0.4 - name: falcon-7b-instruct type: text-generation - revisionID: cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 - commitID: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 + version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs version: 0.0.4 - name: falcon-40b type: text-generation - revisionID: 4a70170c215b36a3cce4b4253f6d0612bb7d4146 - commitURL: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 + version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs version: 0.0.4 - name: falcon-40b-instruct type: text-generation - revisionID: ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f - commitURL: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f + version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs version: 0.0.4 # Tag history: @@ -64,14 +60,12 @@ models: # Mistral - name: mistral-7b type: text-generation - revisionID: 26bca36bde8333b5d7f72e9ed20ccda6a618af24 - commitURL: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 + version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 runtime: tfs tag: 0.0.4 - name: mistral-7b-instruct type: text-generation - revisionID: b70aa86578567ba3301b21c8a27bea4e8f6d6d61 - commitURL: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 + version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs version: 0.0.4 # Tag history: @@ -83,8 +77,7 @@ models: # Phi-2 - name: phi-2 type: text-generation - revisionID: b10c3eba545ad279e7208ee3a5d644566f001670 - commitURL: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 + version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 runtime: tfs version: 0.0.3 # Tag history: From 29f14e08a1035c3d83d35db8a97ac6b1b63a99b9 Mon Sep 17 00:00:00 2001 From: Evan Date: Sat, 30 Mar 2024 18:23:58 -0700 Subject: [PATCH 09/15] Adding MODEL_VERSION into .txt file --- 
.../kind-cluster/docker-job-template.yaml | 1 + docker/presets/llama-2/Dockerfile | 4 +++- docker/presets/tfs-onnx/Dockerfile | 3 ++- docker/presets/tfs/Dockerfile | 3 ++- .../llama2-completion/inference_api.py | 5 +++- presets/models/supported_models.yaml | 24 +++++++++---------- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/.github/workflows/kind-cluster/docker-job-template.yaml b/.github/workflows/kind-cluster/docker-job-template.yaml index 64d86da54..99954233f 100644 --- a/.github/workflows/kind-cluster/docker-job-template.yaml +++ b/.github/workflows/kind-cluster/docker-job-template.yaml @@ -44,6 +44,7 @@ spec: --build-arg VERSION={{VERSION}} \ --build-arg MODEL_TYPE={{MODEL_TYPE}} \ --build-arg IMAGE_NAME={{IMAGE_NAME}} \ + --build-arg MODEL_VERSION={{MODEL_VERSION}} \ -f $DOCKERFILE_PATH / docker push $ACR_NAME.azurecr.io/{{IMAGE_NAME}}:$VERSION env: diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index c9e10fc40..7c44a83d4 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -4,6 +4,7 @@ # --build-arg VERSION={{VERSION}} \ # --build-arg MODEL_TYPE={{MODEL_TYPE}} \ # --build-arg IMAGE_NAME={{IMAGE_NAME}} \ +# --build-arg MODEL_VERSION={{MODEL_VERSION}} \ FROM python:3.8-slim WORKDIR /workspace @@ -28,9 +29,10 @@ ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION ARG IMAGE_NAME +ARG MODEL_VERSION # Write metadata to model_info.txt file -RUN echo -e "$MODEL_TYPE\n$VERSION\n$IMAGE_NAME" > /workspace/llama/model_info.txt +RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/llama/model_info.txt ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index a13dabe28..27463d1a5 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -5,12 +5,13 @@ ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION ARG IMAGE_NAME +ARG MODEL_VERSION # Set the working directory WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/tfs/model_info.txt # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index c2c0a40f2..39aebe1bf 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -4,12 +4,13 @@ ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION ARG IMAGE_NAME +ARG MODEL_VERSION # Set the working directory WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$MODEL_TYPE\n$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/tfs/model_info.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/inference/llama2-completion/inference_api.py b/presets/inference/llama2-completion/inference_api.py index 612b4e789..4f70fbda1 100644 --- a/presets/inference/llama2-completion/inference_api.py +++ 
b/presets/inference/llama2-completion/inference_api.py @@ -8,6 +8,7 @@ import signal import sys import threading +import json from typing import Optional import GPUtil @@ -188,7 +189,9 @@ def get_version(): with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: model_name = f.read() - return {"version": model_name} + # Convert readable text file to json + data = json.dumps(dict(map(str.strip, line.split(':'))) for line in model_name.split('\n')) + return data def setup_worker_routes(): @app_worker.get("/healthz") diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 460f5210e..97c2a2697 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,27 +3,27 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-13b type: llama2-completion runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-70b type: llama2-completion runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Inference API Cleanup (#233) @@ -35,22 +35,22 @@ models: type: text-generation version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - version: 0.0.4 + tag: 0.0.4 - name: falcon-7b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - version: 0.0.4 + tag: 0.0.4 - name: falcon-40b type: text-generation version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - version: 0.0.4 + tag: 0.0.4 - name: falcon-40b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - version: 0.0.4 + tag: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Update Default Params (#294) @@ -67,7 +67,7 @@ models: type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs - version: 0.0.4 + tag: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Update Default Params (#294) @@ -79,7 +79,7 @@ models: type: text-generation version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 runtime: tfs - version: 0.0.3 + tag: 0.0.3 # Tag history: # 0.0.3 - Version endpoint # 0.0.2 - Update Default Params (#294) From fcc19a13eb44e6b12a38de9d35e28282b2d7da94 Mon Sep 17 00:00:00 2001 From: Evan Date: Tue, 2 Apr 2024 18:57:48 -0700 Subject: [PATCH 10/15] MODEL VERSION HASH --- docker/presets/llama-2/Dockerfile | 5 +++-- docker/presets/tfs-onnx/Dockerfile | 5 +++-- docker/presets/tfs/Dockerfile | 5 +++-- presets/inference/llama2-chat/inference_api.py | 7 ++++--- presets/inference/llama2-completion/inference_api.py | 8 +++----- presets/inference/text-generation/inference_api.py | 7 ++++--- 6 files changed, 20 insertions(+), 17 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 7c44a83d4..4fbd3e7e4 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -31,8 +31,9 @@ ARG VERSION ARG IMAGE_NAME 
ARG MODEL_VERSION -# Write metadata to model_info.txt file -RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/llama/model_info.txt +# Write metadata to model_info.json file +RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ + echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/llama/model_info.json ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index 27463d1a5..cd23ac54b 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -10,8 +10,9 @@ ARG MODEL_VERSION # Set the working directory WORKDIR /workspace/tfs -# Write metadata to model_info.txt file -RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/tfs/model_info.txt +# Write metadata to model_info.json file +RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ + echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 39aebe1bf..13b9768ab 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -9,8 +9,9 @@ ARG MODEL_VERSION # Set the working directory WORKDIR /workspace/tfs -# Write metadata to model_info.txt file -RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/tfs/model_info.txt +# Write metadata to model_info.json file +RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ + echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/inference/llama2-chat/inference_api.py b/presets/inference/llama2-chat/inference_api.py index b3a7a909c..a91786e53 100644 --- a/presets/inference/llama2-chat/inference_api.py +++ b/presets/inference/llama2-chat/inference_api.py @@ -8,6 +8,7 @@ import signal import sys import threading +import json from typing import Optional import GPUtil @@ -19,7 +20,7 @@ from pydantic import BaseModel # Constants -MODEL_INFO = "model_info.txt" +MODEL_INFO = "model_info.json" # Setup argparse parser = argparse.ArgumentParser(description="Llama API server.") @@ -197,9 +198,9 @@ def get_metrics(): @app_main.get("/version") def get_version(): with open(f"/workspace/llama/{MODEL_INFO}", "r") as f: - model_name = f.read() + model_info = json.load(f) - return {"version": model_name} + return model_info def setup_worker_routes(): @app_worker.get("/healthz") diff --git a/presets/inference/llama2-completion/inference_api.py 
b/presets/inference/llama2-completion/inference_api.py index 4f70fbda1..f29ba91e1 100644 --- a/presets/inference/llama2-completion/inference_api.py +++ b/presets/inference/llama2-completion/inference_api.py @@ -20,7 +20,7 @@ from pydantic import BaseModel # Constants -MODEL_INFO = "model_info.txt" +MODEL_INFO = "model_info.json" # Setup argparse parser = argparse.ArgumentParser(description="Llama API server.") @@ -187,11 +187,9 @@ def get_metrics(): @app_main.get("/version") def get_version(): with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: - model_name = f.read() + model_info = json.load(f) - # Convert readable text file to json - data = json.dumps(dict(map(str.strip, line.split(':'))) for line in model_name.split('\n')) - return data + return model_info def setup_worker_routes(): @app_worker.get("/healthz") diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index 06032e89a..8d92d53f3 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -3,6 +3,7 @@ import os from dataclasses import asdict, dataclass, field from typing import Any, Dict, List, Optional +import json import GPUtil import torch @@ -14,7 +15,7 @@ GenerationConfig, HfArgumentParser) # Constants -MODEL_INFO = "model_info.txt" +MODEL_INFO = "model_info.json" @dataclass class ModelConfig: @@ -215,9 +216,9 @@ def get_metrics(): @app.get("/version") def get_version(): with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: - model_name = f.read() + model_info = json.load(f) - return {"version": model_name} + return model_info if __name__ == "__main__": local_rank = int(os.environ.get("LOCAL_RANK", 0)) # Default to 0 if not set From 7582ee52e8a3e1af6a3bf6ff5cb68c8b7a07cf03 Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 4 Apr 2024 15:09:02 -0700 Subject: [PATCH 11/15] Get Hash --- docker/presets/llama-2/Dockerfile | 2 +- docker/presets/tfs-onnx/Dockerfile | 2 +- docker/presets/tfs/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 4fbd3e7e4..19a869904 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -32,7 +32,7 @@ ARG IMAGE_NAME ARG MODEL_VERSION # Write metadata to model_info.json file -RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ +RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \ echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/llama/model_info.json ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index cd23ac54b..8fdfc7440 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -11,7 +11,7 @@ ARG MODEL_VERSION WORKDIR /workspace/tfs # Write metadata to model_info.json file -RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ +RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \ echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json # First, copy just the requirements.txt file and install dependencies diff --git a/docker/presets/tfs/Dockerfile 
b/docker/presets/tfs/Dockerfile index 13b9768ab..863e40728 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -10,7 +10,7 @@ ARG MODEL_VERSION WORKDIR /workspace/tfs # Write metadata to model_info.json file -RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ +RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \ echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json # First, copy just the preset files and install dependencies From 9db1a0428bdef0990ef50bdecdcb84fee60679fe Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 4 Apr 2024 15:27:27 -0700 Subject: [PATCH 12/15] Version comments --- presets/models/supported_models.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 900c4691b..ae57ff75d 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -25,7 +25,7 @@ models: runtime: llama-2 tag: 0.0.4 # Tag history: - # 0.0.4 - Version endpoint + # 0.0.4 - Version endpoint (#297) # 0.0.3 - Inference API Cleanup (#233) # 0.0.2 - Eliminate Unnecessary Process Group Creation in Worker Initialization (#244) # 0.0.1 - Initial Release @@ -42,7 +42,7 @@ models: runtime: tfs tag: 0.0.5 # Tag history: - # 0.0.5 - Version endpoint + # 0.0.5 - Version endpoint (#297) # 0.0.4 - Adjust default model params (#310) # 0.0.3 - Update Default Params (#294) # 0.0.2 - Inference API Cleanup (#233) @@ -59,7 +59,7 @@ models: runtime: tfs tag: 0.0.6 # Tag history for 40b models: - # 0.0.6 - Version endpoint + # 0.0.6 - Version endpoint (#297) # 0.0.5 - Adjust default model params (#310) # 0.0.4 - Skipped due to incomplete upload issue # 0.0.3 - Update Default Params (#294) @@ -78,7 +78,7 @@ models: runtime: tfs tag: 0.0.5 # Tag history: - # 0.0.5 - Version endpoint + # 0.0.5 - Version endpoint (#297) # 0.0.4 - Adjust default model params (#310) # 0.0.3 - Update Default Params (#294) # 0.0.2 - Inference API Cleanup (#233) @@ -91,7 +91,7 @@ models: runtime: tfs tag: 0.0.4 # Tag history: - # 0.0.4 - Version endpoint + # 0.0.4 - Version endpoint (#297) # 0.0.3 - Adjust default model params (#310) # 0.0.2 - Update Default Params (#294) # 0.0.1 - Initial Release From bf015c079289713f9b35e66b3c3f0fcb9f3d7c02 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Thu, 4 Apr 2024 17:15:27 -0700 Subject: [PATCH 13/15] fix: Checkout Evans awesome fork --- .github/workflows/e2e-preset-test.yml | 6 +----- .github/workflows/kind-cluster/determine_models.py | 14 +++++++++++--- .github/workflows/preset-image-build.yml | 1 + 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index c22e7d6cc..3c2b1365d 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -48,6 +48,7 @@ jobs: run: | PR_BRANCH=${{ env.BRANCH_NAME }} \ FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \ + PR_REPO_URL=${{ github.event.pull_request.head.repo.clone_url }} \ python3 .github/workflows/kind-cluster/determine_models.py - name: Print Determined Models @@ -274,11 +275,6 @@ jobs: if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | curl http://${{ steps.get_ip.outputs.SERVICE_IP 
}}:80/healthz - - - name: Test version endpoint - if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') - run: | - curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/version - name: Test inference endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') diff --git a/.github/workflows/kind-cluster/determine_models.py b/.github/workflows/kind-cluster/determine_models.py index 5ace3ba63..18b5773e1 100644 --- a/.github/workflows/kind-cluster/determine_models.py +++ b/.github/workflows/kind-cluster/determine_models.py @@ -90,7 +90,7 @@ def models_to_build(files_changed): seen_model_types.add(model_info["type"]) return list(models) -def check_modified_models(pr_branch): +def check_modified_models(pr_branch, pr_repo_url): """Check for modified models in the repository.""" repo_dir = Path.cwd() / "repo" @@ -102,7 +102,14 @@ def check_modified_models(pr_branch): run_command("git checkout --detach") run_command("git fetch origin main:main") - run_command(f"git fetch origin {pr_branch}:{pr_branch}") + + fetch_command = f"git fetch origin {pr_branch}:{pr_branch}" + if pr_repo_url != KAITO_REPO_URL: + # Add the PR's repo as a new remote only if it's different from the main repo + run_command("git remote add pr_repo {}".format(pr_repo_url)) + fetch_command = f"git fetch pr_repo {pr_branch}" + + run_command(fetch_command) run_command(f"git checkout {pr_branch}") files = run_command("git diff --name-only origin/main") # Returns each file on newline @@ -118,6 +125,7 @@ def check_modified_models(pr_branch): def main(): pr_branch = os.environ.get("PR_BRANCH", "main") # If not specified default to 'main' force_run_all = os.environ.get("FORCE_RUN_ALL", "false") # If not specified default to False + pr_repo_url = os.environ.get("PR_REPO_URL", KAITO_REPO_URL) affected_models = [] if force_run_all != "false": @@ -125,7 +133,7 @@ def main(): else: # Logic to determine affected models # Example: affected_models = ['model1', 'model2', 'model3'] - affected_models = check_modified_models(pr_branch) + affected_models = check_modified_models(pr_branch, pr_repo_url) # Convert the list of models into JSON matrix format matrix = create_matrix(affected_models) diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml index 5cdb8f98e..a5f100560 100644 --- a/.github/workflows/preset-image-build.yml +++ b/.github/workflows/preset-image-build.yml @@ -55,6 +55,7 @@ jobs: run: | PR_BRANCH=${{ env.BRANCH_NAME }} \ FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \ + PR_REPO_URL=${{ github.event.pull_request.head.repo.clone_url }} \ python3 .github/workflows/kind-cluster/determine_models.py - name: Print Determined Models From 6b88a937f2e56ccbde416a70ea3e59ef166febf6 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Thu, 4 Apr 2024 17:23:18 -0700 Subject: [PATCH 14/15] fix: Checkout Evans awesome fork --- .github/workflows/e2e-preset-test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 3c2b1365d..f89d1660c 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -275,6 +275,11 @@ jobs: if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | curl http://${{ 
steps.get_ip.outputs.SERVICE_IP }}:80/healthz + + - name: Test version endpoint + if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') + run: | + curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/version - name: Test inference endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') From 81ce9c9c87d8bbfb12fc6c702e95f6b7b1426e1f Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Thu, 4 Apr 2024 18:04:02 -0700 Subject: [PATCH 15/15] feat: Document version endpoint --- .../inference/text-generation/api_spec.json | 1199 +++++++++-------- .../text-generation/inference_api.py | 77 +- 2 files changed, 699 insertions(+), 577 deletions(-) diff --git a/presets/inference/text-generation/api_spec.json b/presets/inference/text-generation/api_spec.json index 480fa97e4..8cdb9c16d 100644 --- a/presets/inference/text-generation/api_spec.json +++ b/presets/inference/text-generation/api_spec.json @@ -1,599 +1,658 @@ { "openapi": "3.1.0", "info": { - "title": "FastAPI", - "version": "0.1.0" + "title": "FastAPI", + "version": "0.1.0" }, "paths": { - "/": { - "get": { - "summary": "Home Endpoint", - "description": "A simple endpoint that indicates the server is running.\nNo parameters are required. Returns a message indicating the server status.", - "operationId": "home__get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HomeResponse" - } - } - } - } + "/": { + "get": { + "summary": "Home Endpoint", + "description": "A simple endpoint that indicates the server is running.\nNo parameters are required. 
Returns a message indicating the server status.", + "operationId": "home__get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HomeResponse" + } } + } } - }, - "/healthz": { - "get": { - "summary": "Health Check Endpoint", - "operationId": "health_check_healthz_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HealthStatus" - }, - "example": { - "status": "Healthy" - } - } - } - }, - "500": { - "description": "Error Response", - "content": { - "application/json": { - "examples": { - "model_uninitialized": { - "summary": "Model not initialized", - "value": { - "detail": "Model not initialized" - } - }, - "pipeline_uninitialized": { - "summary": "Pipeline not initialized", - "value": { - "detail": "Pipeline not initialized" - } - } - } - } - } + } + } + }, + "/healthz": { + "get": { + "summary": "Health Check Endpoint", + "operationId": "health_check_healthz_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HealthStatus" + }, + "example": { + "status": "Healthy" + } + } + } + }, + "500": { + "description": "Error Response", + "content": { + "application/json": { + "examples": { + "model_uninitialized": { + "summary": "Model not initialized", + "value": { + "detail": "Model not initialized" + } + }, + "pipeline_uninitialized": { + "summary": "Pipeline not initialized", + "value": { + "detail": "Pipeline not initialized" + } } + } } + } } - }, - "/chat": { - "post": { - "summary": "Chat Endpoint", - "description": "Processes chat requests, generating text based on the specified pipeline (text generation or conversational).\nValidates required parameters based on the pipeline and returns the generated text.", - "operationId": "generate_text_chat_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/UnifiedRequestModel" - }, - "examples": { - "text_generation_example": { - "summary": "Text Generation Example", - "description": "An example of a text generation request.", - "value": { - "prompt": "Tell me a joke", - "return_full_text": true, - "clean_up_tokenization_spaces": false, - "generate_kwargs": { - "max_length": 200, - "min_length": 0, - "do_sample": true, - "early_stopping": false, - "num_beams": 1, - "temperature": 1, - "top_k": 10, - "top_p": 1, - "typical_p": 1, - "repetition_penalty": 1, - "eos_token_id": 11 - } - } - }, - "conversation_example": { - "summary": "Conversation Example", - "description": "An example of a conversational request.", - "value": { - "messages": [ - { - "role": "user", - "content": "What is your favourite condiment?" - }, - { - "role": "assistant", - "content": "Well, im quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever im cooking up in the kitchen!" - }, - { - "role": "user", - "content": "Do you have mayonnaise recipes?" 
- } - ], - "return_full_text": true, - "clean_up_tokenization_spaces": false, - "generate_kwargs": { - "max_length": 200, - "min_length": 0, - "do_sample": true, - "early_stopping": false, - "num_beams": 1, - "temperature": 1, - "top_k": 10, - "top_p": 1, - "typical_p": 1, - "repetition_penalty": 1, - "eos_token_id": 11 - } - } - } - } - } - }, - "required": true + } + } + }, + "/chat": { + "post": { + "summary": "Chat Endpoint", + "description": "Processes chat requests, generating text based on the specified pipeline (text generation or conversational).\nValidates required parameters based on the pipeline and returns the generated text.", + "operationId": "generate_text_chat_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UnifiedRequestModel" }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {}, - "examples": { - "text_generation": { - "summary": "Text Generation Response", - "value": { - "Result": "Generated text based on the prompt." - } - }, - "conversation": { - "summary": "Conversation Response", - "value": { - "Result": "Response to the last message in the conversation." - } - } - } - } - } - }, - "400": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "examples": { - "missing_prompt": { - "summary": "Missing Prompt", - "value": { - "detail": "Text generation parameter prompt required" - } - }, - "missing_messages": { - "summary": "Missing Messages", - "value": { - "detail": "Conversational parameter messages required" - } - } - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } + "examples": { + "text_generation_example": { + "summary": "Text Generation Example", + "description": "An example of a text generation request.", + "value": { + "prompt": "Tell me a joke", + "return_full_text": true, + "clean_up_tokenization_spaces": false, + "generate_kwargs": { + "max_length": 200, + "min_length": 0, + "do_sample": true, + "early_stopping": false, + "num_beams": 1, + "temperature": 1, + "top_k": 10, + "top_p": 1, + "typical_p": 1, + "repetition_penalty": 1, + "eos_token_id": 11 + } + } + }, + "conversation_example": { + "summary": "Conversation Example", + "description": "An example of a conversational request.", + "value": { + "messages": [ + { + "role": "user", + "content": "What is your favourite condiment?" + }, + { + "role": "assistant", + "content": "Well, im quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever im cooking up in the kitchen!" + }, + { + "role": "user", + "content": "Do you have mayonnaise recipes?" 
} + ], + "return_full_text": true, + "clean_up_tokenization_spaces": false, + "generate_kwargs": { + "max_length": 200, + "min_length": 0, + "do_sample": true, + "early_stopping": false, + "num_beams": 1, + "temperature": 1, + "top_k": 10, + "top_p": 1, + "typical_p": 1, + "repetition_penalty": 1, + "eos_token_id": 11 + } + } + } + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {}, + "examples": { + "text_generation": { + "summary": "Text Generation Response", + "value": { + "Result": "Generated text based on the prompt." + } + }, + "conversation": { + "summary": "Conversation Response", + "value": { + "Result": "Response to the last message in the conversation." + } + } + } + } + } + }, + "400": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "examples": { + "missing_prompt": { + "summary": "Missing Prompt", + "value": { + "detail": "Text generation parameter prompt required" + } + }, + "missing_messages": { + "summary": "Missing Messages", + "value": { + "detail": "Conversational parameter messages required" + } } + } } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } } - }, - "/metrics": { - "get": { - "summary": "Metrics Endpoint", - "description": "Provides system metrics, including GPU details if available, or CPU and memory usage otherwise.\nUseful for monitoring the resource utilization of the server running the ML models.", - "operationId": "get_metrics_metrics_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/MetricsResponse" - }, - "examples": { - "gpu_metrics": { - "summary": "Example when GPUs are available", - "value": { - "gpu_info": [ - { - "id": "GPU-1234", - "name": "GeForce GTX 950", - "load": "25.00%", - "temperature": "55 C", - "memory": { - "used": "1.00 GB", - "total": "2.00 GB" - } - } - ] - } - }, - "cpu_metrics": { - "summary": "Example when only CPU is available", - "value": { - "cpu_info": { - "load_percentage": 20, - "physical_cores": 4, - "total_cores": 8, - "memory": { - "used": "4.00 GB", - "total": "16.00 GB" - } - } - } - } - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } + } + } + }, + "/metrics": { + "get": { + "summary": "Metrics Endpoint", + "description": "Provides system metrics, including GPU details if available, or CPU and memory usage otherwise.\nUseful for monitoring the resource utilization of the server running the ML models.", + "operationId": "get_metrics_metrics_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MetricsResponse" + }, + "examples": { + "gpu_metrics": { + "summary": "Example when GPUs are available", + "value": { + "gpu_info": [ + { + "id": "GPU-1234", + "name": "GeForce GTX 950", + "load": "25.00%", + "temperature": "55 C", + "memory": { + "used": "1.00 GB", + "total": "2.00 GB" } + } + ] + } + }, 
+ "cpu_metrics": { + "summary": "Example when only CPU is available", + "value": { + "cpu_info": { + "load_percentage": 20, + "physical_cores": 4, + "total_cores": 8, + "memory": { + "used": "4.00 GB", + "total": "16.00 GB" + } } + } } + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } } + } } + } } - }, - "components": { - "schemas": { - "CPUInfo": { - "properties": { - "load_percentage": { - "type": "number", - "title": "Load Percentage" - }, - "physical_cores": { - "type": "integer", - "title": "Physical Cores" - }, - "total_cores": { - "type": "integer", - "title": "Total Cores" - }, - "memory": { - "$ref": "#/components/schemas/MemoryInfo" + }, + "/version": { + "get": { + "summary": "Get Model Information", + "description": "Reads and returns model version information from a predefined JSON file.", + "operationId": "get_version_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {}, + "examples": { + "model_info": { + "summary": "Model Information Response", + "value": { + "Model Type": "Your Model Type", + "Version": "1.0.0", + "Image Name": "model_image_name", + "Model Version URL": "http://example.com/model/version", + "REVISION_ID": "revision_hash" + } } - }, - "type": "object", - "required": [ - "load_percentage", - "physical_cores", - "total_cores", - "memory" - ], - "title": "CPUInfo" - }, - "ErrorResponse": { - "properties": { - "detail": { - "type": "string", - "title": "Detail" - } - }, - "type": "object", - "required": [ - "detail" - ], - "title": "ErrorResponse" - }, - "GPUInfo": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "name": { - "type": "string", - "title": "Name" - }, - "load": { - "type": "string", - "title": "Load" - }, - "temperature": { - "type": "string", - "title": "Temperature" - }, - "memory": { - "$ref": "#/components/schemas/MemoryInfo" - } - }, - "type": "object", - "required": [ - "id", - "name", - "load", - "temperature", - "memory" - ], - "title": "GPUInfo" - }, - "GenerateKwargs": { - "properties": { - "max_length": { - "type": "integer", - "title": "Max Length", - "default": 200 - }, - "min_length": { - "type": "integer", - "title": "Min Length", - "default": 0 - }, - "do_sample": { - "type": "boolean", - "title": "Do Sample", - "default": true - }, - "early_stopping": { - "type": "boolean", - "title": "Early Stopping", - "default": false - }, - "num_beams": { - "type": "integer", - "title": "Num Beams", - "default": 1 - }, - "temperature": { - "type": "number", - "title": "Temperature", - "default": 1 - }, - "top_k": { - "type": "integer", - "title": "Top K", - "default": 10 - }, - "top_p": { - "type": "number", - "title": "Top P", - "default": 1 - }, - "typical_p": { - "type": "number", - "title": "Typical P", - "default": 1 - }, - "repetition_penalty": { - "type": "number", - "title": "Repetition Penalty", - "default": 1 - }, - "pad_token_id": { - "type": "integer", - "title": "Pad Token Id" - }, - "eos_token_id": { - "type": "integer", - "title": "Eos Token Id", - "default": 11 - } - }, - "type": "object", - "title": "GenerateKwargs", - "example": { - "max_length": 200, - "temperature": 0.7, - "top_p": 0.9, - "additional_param": "Example value" + } } + } }, - "HTTPValidationError": { - "properties": { - "detail": { - "items": { - "$ref": "#/components/schemas/ValidationError" - }, - "type": "array", - "title": 
"Detail" - } - }, - "type": "object", - "title": "HTTPValidationError" - }, - "HealthStatus": { - "properties": { - "status": { - "type": "string", - "title": "Status", - "example": "Healthy" - } - }, - "type": "object", - "required": [ - "status" - ], - "title": "HealthStatus" - }, - "HomeResponse": { - "properties": { - "message": { - "type": "string", - "title": "Message", - "example": "Server is running" - } - }, - "type": "object", - "required": [ - "message" - ], - "title": "HomeResponse" - }, - "MemoryInfo": { - "properties": { - "used": { - "type": "string", - "title": "Used" - }, - "total": { - "type": "string", - "title": "Total" + "404": { + "description": "Model Info Not Found", + "content": { + "application/json": { + "examples": { + "file_not_found": { + "summary": "Model Info File Not Found", + "value": { + "detail": "/workspace/tfs/model_info.json file not found." + } } - }, - "type": "object", - "required": [ - "used", - "total" - ], - "title": "MemoryInfo" - }, - "Message": { - "properties": { - "role": { - "type": "string", - "title": "Role" - }, - "content": { - "type": "string", - "title": "Content" - } - }, - "type": "object", - "required": [ - "role", - "content" - ], - "title": "Message" - }, - "MetricsResponse": { - "properties": { - "gpu_info": { - "items": { - "$ref": "#/components/schemas/GPUInfo" - }, - "type": "array", - "title": "Gpu Info" - }, - "cpu_info": { - "$ref": "#/components/schemas/CPUInfo" - } - }, - "type": "object", - "title": "MetricsResponse" - }, - "UnifiedRequestModel": { - "properties": { - "prompt": { - "type": "string", - "title": "Prompt", - "description": "Prompt for text generation. Required for text-generation pipeline. Do not use with 'messages'." - }, - "return_full_text": { - "type": "boolean", - "title": "Return Full Text", - "description": "Return full text if True, else only added text", - "default": true - }, - "clean_up_tokenization_spaces": { - "type": "boolean", - "title": "Clean Up Tokenization Spaces", - "description": "Clean up extra spaces in text output", - "default": false - }, - "prefix": { - "type": "string", - "title": "Prefix", - "description": "Prefix added to prompt" - }, - "handle_long_generation": { - "type": "string", - "title": "Handle Long Generation", - "description": "Strategy to handle long generation" - }, - "generate_kwargs": { - "allOf": [ - { - "$ref": "#/components/schemas/GenerateKwargs" - } - ], - "title": "Generate Kwargs", - "description": "Additional kwargs for generate method" - }, - "messages": { - "items": { - "$ref": "#/components/schemas/Message" - }, - "type": "array", - "title": "Messages", - "description": "Messages for conversational model. Required for conversational pipeline. Do not use with 'prompt'." - } - }, - "type": "object", - "title": "UnifiedRequestModel" - }, - "ValidationError": { - "properties": { - "loc": { - "items": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - } - ] - }, - "type": "array", - "title": "Location" - }, - "msg": { - "type": "string", - "title": "Message" - }, - "type": { - "type": "string", - "title": "Error Type" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "examples": { + "unexpected_error": { + "summary": "Unexpected Error", + "value": { + "detail": "An unexpected error occurred on the server." 
+ } } - }, - "type": "object", - "required": [ - "loc", - "msg", - "type" - ], - "title": "ValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "CPUInfo": { + "properties": { + "load_percentage": { + "type": "number", + "title": "Load Percentage" + }, + "physical_cores": { + "type": "integer", + "title": "Physical Cores" + }, + "total_cores": { + "type": "integer", + "title": "Total Cores" + }, + "memory": { + "$ref": "#/components/schemas/MemoryInfo" + } + }, + "type": "object", + "required": [ + "load_percentage", + "physical_cores", + "total_cores", + "memory" + ], + "title": "CPUInfo" + }, + "ErrorResponse": { + "properties": { + "detail": { + "type": "string", + "title": "Detail" + } + }, + "type": "object", + "required": [ + "detail" + ], + "title": "ErrorResponse" + }, + "GPUInfo": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "name": { + "type": "string", + "title": "Name" + }, + "load": { + "type": "string", + "title": "Load" + }, + "temperature": { + "type": "string", + "title": "Temperature" + }, + "memory": { + "$ref": "#/components/schemas/MemoryInfo" + } + }, + "type": "object", + "required": [ + "id", + "name", + "load", + "temperature", + "memory" + ], + "title": "GPUInfo" + }, + "GenerateKwargs": { + "properties": { + "max_length": { + "type": "integer", + "title": "Max Length", + "default": 200 + }, + "min_length": { + "type": "integer", + "title": "Min Length", + "default": 0 + }, + "do_sample": { + "type": "boolean", + "title": "Do Sample", + "default": true + }, + "early_stopping": { + "type": "boolean", + "title": "Early Stopping", + "default": false + }, + "num_beams": { + "type": "integer", + "title": "Num Beams", + "default": 1 + }, + "temperature": { + "type": "number", + "title": "Temperature", + "default": 1 + }, + "top_k": { + "type": "integer", + "title": "Top K", + "default": 10 + }, + "top_p": { + "type": "number", + "title": "Top P", + "default": 1 + }, + "typical_p": { + "type": "number", + "title": "Typical P", + "default": 1 + }, + "repetition_penalty": { + "type": "number", + "title": "Repetition Penalty", + "default": 1 + }, + "pad_token_id": { + "type": "integer", + "title": "Pad Token Id" + }, + "eos_token_id": { + "type": "integer", + "title": "Eos Token Id", + "default": 11 + } + }, + "type": "object", + "title": "GenerateKwargs", + "example": { + "max_length": 200, + "temperature": 0.7, + "top_p": 0.9, + "additional_param": "Example value" + } + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, + "HealthStatus": { + "properties": { + "status": { + "type": "string", + "title": "Status", + "example": "Healthy" + } + }, + "type": "object", + "required": [ + "status" + ], + "title": "HealthStatus" + }, + "HomeResponse": { + "properties": { + "message": { + "type": "string", + "title": "Message", + "example": "Server is running" + } + }, + "type": "object", + "required": [ + "message" + ], + "title": "HomeResponse" + }, + "MemoryInfo": { + "properties": { + "used": { + "type": "string", + "title": "Used" + }, + "total": { + "type": "string", + "title": "Total" + } + }, + "type": "object", + "required": [ + "used", + "total" + ], + "title": "MemoryInfo" + }, + "Message": { + "properties": { + "role": { + "type": "string", + "title": "Role" + }, + "content": { + "type": "string", + "title": "Content" + } + }, + 
"type": "object", + "required": [ + "role", + "content" + ], + "title": "Message" + }, + "MetricsResponse": { + "properties": { + "gpu_info": { + "items": { + "$ref": "#/components/schemas/GPUInfo" + }, + "type": "array", + "title": "Gpu Info" + }, + "cpu_info": { + "$ref": "#/components/schemas/CPUInfo" + } + }, + "type": "object", + "title": "MetricsResponse" + }, + "UnifiedRequestModel": { + "properties": { + "prompt": { + "type": "string", + "title": "Prompt", + "description": "Prompt for text generation. Required for text-generation pipeline. Do not use with 'messages'." + }, + "return_full_text": { + "type": "boolean", + "title": "Return Full Text", + "description": "Return full text if True, else only added text", + "default": true + }, + "clean_up_tokenization_spaces": { + "type": "boolean", + "title": "Clean Up Tokenization Spaces", + "description": "Clean up extra spaces in text output", + "default": false + }, + "prefix": { + "type": "string", + "title": "Prefix", + "description": "Prefix added to prompt" + }, + "handle_long_generation": { + "type": "string", + "title": "Handle Long Generation", + "description": "Strategy to handle long generation" + }, + "generate_kwargs": { + "allOf": [ + { + "$ref": "#/components/schemas/GenerateKwargs" + } + ], + "title": "Generate Kwargs", + "description": "Additional kwargs for generate method" + }, + "messages": { + "items": { + "$ref": "#/components/schemas/Message" + }, + "type": "array", + "title": "Messages", + "description": "Messages for conversational model. Required for conversational pipeline. Do not use with 'prompt'." + } + }, + "type": "object", + "title": "UnifiedRequestModel" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "type": "array", + "title": "Location" + }, + "msg": { + "type": "string", + "title": "Message" + }, + "type": { + "type": "string", + "title": "Error Type" } + }, + "type": "object", + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError" } + } } -} \ No newline at end of file + } \ No newline at end of file diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index 8353381f9..23dccba5d 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -1,8 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+import json
 import os
 from dataclasses import asdict, dataclass, field
-import json
 from typing import Annotated, Any, Dict, List, Optional
 
 import GPUtil
@@ -17,7 +17,9 @@
                           GenerationConfig, HfArgumentParser)
 
 # Constants
-MODEL_INFO = "model_info.json"
+APP_DIR = "/workspace/tfs"
+WEIGHTS_DIR = f"{APP_DIR}/weights"
+MODEL_INFO_FILE = f"{APP_DIR}/model_info.json"
 
 @dataclass
 class ModelConfig:
@@ -25,7 +27,7 @@ class ModelConfig:
     """
     Transformers Model Configuration Parameters
     """
     pipeline: str = field(metadata={"help": "The model pipeline for the pre-trained model"})
-    pretrained_model_name_or_path: Optional[str] = field(default="/workspace/tfs/weights", metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"})
+    pretrained_model_name_or_path: Optional[str] = field(default=WEIGHTS_DIR, metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"})
     state_dict: Optional[Dict[str, Any]] = field(default=None, metadata={"help": "State dictionary for the model"})
     cache_dir: Optional[str] = field(default=None, metadata={"help": "Cache directory for the model"})
     from_tf: bool = field(default=False, metadata={"help": "Load model from a TensorFlow checkpoint"})
@@ -431,11 +433,72 @@ def get_metrics():
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
-@app.get("/version")
+@app.get(
+    "/version",
+    summary="Get Model Information",
+    response_description="Model Version Information",
+    responses={
+        200: {
+            "description": "Successful Response",
+            "content": {
+                "application/json": {
+                    "examples": {
+                        "model_info": {
+                            "summary": "Model Information Response",
+                            "value": {
+                                "Model Type": "Your Model Type",
+                                "Version": "1.0.0",
+                                "Image Name": "model_image_name",
+                                "Model Version URL": "http://example.com/model/version",
+                                "REVISION_ID": "revision_hash"
+                            }
+                        }
+                    }
+                }
+            }
+        },
+        404: {
+            "description": "Model Info Not Found",
+            "content": {
+                "application/json": {
+                    "examples": {
+                        "file_not_found": {
+                            "summary": "Model Info File Not Found",
+                            "value": {"detail": f"{MODEL_INFO_FILE} file not found."}
+                        }
+                    }
+                }
+            }
+        },
+        500: {
+            "description": "Internal Server Error",
+            "content": {
+                "application/json": {
+                    "examples": {
+                        "unexpected_error": {
+                            "summary": "Unexpected Error",
+                            "value": {
+                                "detail": "An unexpected error occurred on the server."
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+)
 def get_version():
-    with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f:
-        model_info = json.load(f)
-
+    """
+    Reads and returns model version information from a predefined JSON file.
+    """
+    try:
+        with open(MODEL_INFO_FILE, "r") as f:
+            model_info = json.load(f)
+    except FileNotFoundError:
+        raise HTTPException(status_code=404, detail=f"{MODEL_INFO_FILE} file not found.")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
     return model_info
 
 if __name__ == "__main__":
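
A quick way to exercise the documented /version and /healthz endpoints once the container is up is a small client call. This is only an illustrative sketch, not part of the patch: it assumes the `requests` package is installed and that the inference server is reachable at localhost on port 5000 (adjust host and port to match your deployment).

    import requests

    BASE_URL = "http://localhost:5000"  # assumed address; change to your service endpoint

    # /version returns the model metadata recorded in model_info.json
    resp = requests.get(f"{BASE_URL}/version", timeout=10)
    resp.raise_for_status()
    print(resp.json())  # e.g. {"Model Type": "...", "Version": "1.0.0", "Image Name": "...", ...}

    # /healthz confirms the model and pipeline are initialized
    print(requests.get(f"{BASE_URL}/healthz", timeout=10).json())  # {"status": "Healthy"}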