From 624f47929d2a4d18475fa6366eb807491c251fdb Mon Sep 17 00:00:00 2001 From: Evan Date: Mon, 11 Mar 2024 22:48:32 -0700 Subject: [PATCH 01/15] Initial Dockerfile and fastapi implementation --- docker/presets/llama-2/Dockerfile | 5 ++++- docker/presets/tfs-onnx/Dockerfile | 4 +++- docker/presets/tfs/Dockerfile | 4 +++- presets/inference/llama2-chat/inference_api.py | 7 +++++++ presets/inference/llama2-completion/inference_api.py | 7 +++++++ presets/inference/text-generation/inference_api.py | 7 +++++++ 6 files changed, 31 insertions(+), 3 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 641d158bc..537d501a9 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -26,8 +26,11 @@ RUN pip install 'uvicorn[standard]' ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION -# Write the version to a file +ARG IMAGE_NAME + +# Write metadata to .txt files RUN echo $VERSION > /workspace/llama/version.txt +RUN echo $IMAGE_NAME > /workspace/llama/model_name.txt ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index 12e788346..41ee80f0b 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -4,12 +4,14 @@ FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu118-py38-torch211 ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION +ARG IMAGE_NAME # Set the working directory WORKDIR /workspace/tfs -# Write the version to a file +# Write metadata to .txt files RUN echo $VERSION > /workspace/tfs/version.txt +RUN echo $IMAGE_NAME > /workspace/tfs/model_name.txt # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 5a322b8bd..2bb446342 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -3,12 +3,14 @@ FROM python:3.10-slim ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION +ARG IMAGE_NAME # Set the working directory WORKDIR /workspace/tfs -# Write the version to a file +# Write metadata to .txt files RUN echo $VERSION > /workspace/tfs/version.txt +RUN echo $IMAGE_NAME > /workspace/tfs/model_name.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/inference/llama2-chat/inference_api.py b/presets/inference/llama2-chat/inference_api.py index 11776bf3d..9691db010 100644 --- a/presets/inference/llama2-chat/inference_api.py +++ b/presets/inference/llama2-chat/inference_api.py @@ -191,6 +191,13 @@ def get_metrics(): except Exception as e: return {"error": str(e)} + @app_main.get("/version") + def health_check(): + with open("/workspace/llama/model_name.txt", "r") as f: + model_name = f.read() + + return {"version": model_name} + def setup_worker_routes(): @app_worker.get("/healthz") def health_check(): diff --git a/presets/inference/llama2-completion/inference_api.py b/presets/inference/llama2-completion/inference_api.py index cf500146a..9adb230c1 100644 --- a/presets/inference/llama2-completion/inference_api.py +++ b/presets/inference/llama2-completion/inference_api.py @@ -180,6 +180,13 @@ def get_metrics(): except Exception as e: return {"error": str(e)} + @app_main.get("/version") + def health_check(): + with open("/workspace/tfs/model_name.txt", "r") as f: + 
model_name = f.read() + + return {"version": model_name} + def setup_worker_routes(): @app_worker.get("/healthz") def health_check(): diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index 73c7b5095..ad713d075 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -222,6 +222,13 @@ def get_metrics(): except Exception as e: return {"error": str(e)} +@app.get("/version") +def health_check(): + with open("/workspace/tfs/model_name.txt", "r") as f: + model_name = f.read() + + return {"version": model_name} + if __name__ == "__main__": local_rank = int(os.environ.get("LOCAL_RANK", 0)) # Default to 0 if not set port = 5000 + local_rank # Adjust port based on local rank From 484c0addc7819318fb539ecd7ad7c9b50356702c Mon Sep 17 00:00:00 2001 From: Evan Date: Wed, 13 Mar 2024 13:56:25 -0700 Subject: [PATCH 02/15] Rename, add constants, template update --- .github/workflows/kind-cluster/docker-job-template.yaml | 1 + docker/presets/llama-2/Dockerfile | 5 ++--- docker/presets/tfs-onnx/Dockerfile | 5 ++--- docker/presets/tfs/Dockerfile | 5 ++--- presets/inference/llama2-chat/inference_api.py | 7 +++++-- presets/inference/llama2-completion/inference_api.py | 7 +++++-- presets/inference/text-generation/inference_api.py | 6 ++++-- 7 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/workflows/kind-cluster/docker-job-template.yaml b/.github/workflows/kind-cluster/docker-job-template.yaml index a19860f88..64d86da54 100644 --- a/.github/workflows/kind-cluster/docker-job-template.yaml +++ b/.github/workflows/kind-cluster/docker-job-template.yaml @@ -43,6 +43,7 @@ spec: --build-arg WEIGHTS_PATH=/weights \ --build-arg VERSION={{VERSION}} \ --build-arg MODEL_TYPE={{MODEL_TYPE}} \ + --build-arg IMAGE_NAME={{IMAGE_NAME}} \ -f $DOCKERFILE_PATH / docker push $ACR_NAME.azurecr.io/{{IMAGE_NAME}}:$VERSION env: diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 537d501a9..e26e64043 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -28,9 +28,8 @@ ARG MODEL_TYPE ARG VERSION ARG IMAGE_NAME -# Write metadata to .txt files -RUN echo $VERSION > /workspace/llama/version.txt -RUN echo $IMAGE_NAME > /workspace/llama/model_name.txt +# Write metadata to model_info.txt file +RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/llama/model_info.txt ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index 41ee80f0b..ed39e3555 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -9,9 +9,8 @@ ARG IMAGE_NAME # Set the working directory WORKDIR /workspace/tfs -# Write metadata to .txt files -RUN echo $VERSION > /workspace/tfs/version.txt -RUN echo $IMAGE_NAME > /workspace/tfs/model_name.txt +# Write metadata to model_info.txt file +RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 2bb446342..a8d5e7587 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -8,9 +8,8 @@ ARG IMAGE_NAME # Set the working directory WORKDIR /workspace/tfs -# Write metadata to 
.txt files -RUN echo $VERSION > /workspace/tfs/version.txt -RUN echo $IMAGE_NAME > /workspace/tfs/model_name.txt +# Write metadata to model_info.txt file +RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/inference/llama2-chat/inference_api.py b/presets/inference/llama2-chat/inference_api.py index 9691db010..b3a7a909c 100644 --- a/presets/inference/llama2-chat/inference_api.py +++ b/presets/inference/llama2-chat/inference_api.py @@ -18,6 +18,9 @@ from llama import Llama from pydantic import BaseModel +# Constants +MODEL_INFO = "model_info.txt" + # Setup argparse parser = argparse.ArgumentParser(description="Llama API server.") parser.add_argument("--ckpt_dir", default="weights/", help="Checkpoint directory.") @@ -192,8 +195,8 @@ def get_metrics(): return {"error": str(e)} @app_main.get("/version") - def health_check(): - with open("/workspace/llama/model_name.txt", "r") as f: + def get_version(): + with open(f"/workspace/llama/{MODEL_INFO}", "r") as f: model_name = f.read() return {"version": model_name} diff --git a/presets/inference/llama2-completion/inference_api.py b/presets/inference/llama2-completion/inference_api.py index 9adb230c1..612b4e789 100644 --- a/presets/inference/llama2-completion/inference_api.py +++ b/presets/inference/llama2-completion/inference_api.py @@ -18,6 +18,9 @@ from llama import Llama from pydantic import BaseModel +# Constants +MODEL_INFO = "model_info.txt" + # Setup argparse parser = argparse.ArgumentParser(description="Llama API server.") parser.add_argument("--ckpt_dir", default="weights/", help="Checkpoint directory.") @@ -181,8 +184,8 @@ def get_metrics(): return {"error": str(e)} @app_main.get("/version") - def health_check(): - with open("/workspace/tfs/model_name.txt", "r") as f: + def get_version(): + with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: model_name = f.read() return {"version": model_name} diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index ad713d075..b50a5884e 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -13,6 +13,8 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, GenerationConfig, HfArgumentParser) +# Constants +MODEL_INFO = "model_info.txt" @dataclass class ModelConfig: @@ -223,8 +225,8 @@ def get_metrics(): return {"error": str(e)} @app.get("/version") -def health_check(): - with open("/workspace/tfs/model_name.txt", "r") as f: +def get_version(): + with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: model_name = f.read() return {"version": model_name} From 031735db71cffad287da7505a76508b32c30e8a1 Mon Sep 17 00:00:00 2001 From: Evan Date: Wed, 13 Mar 2024 17:54:12 -0700 Subject: [PATCH 03/15] Fix formatting --- docker/presets/llama-2/Dockerfile | 2 +- docker/presets/tfs-onnx/Dockerfile | 2 +- docker/presets/tfs/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index e26e64043..e5854f805 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -29,7 +29,7 @@ ARG VERSION ARG IMAGE_NAME # Write metadata to model_info.txt file -RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/llama/model_info.txt +RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/llama/model_info.txt 
ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index ed39e3555..a13dabe28 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -10,7 +10,7 @@ ARG IMAGE_NAME WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index a8d5e7587..4f04df4c3 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -9,7 +9,7 @@ ARG IMAGE_NAME WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and From afc70300ae0005485a1167e2add7fc4d588b2c19 Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 14 Mar 2024 13:53:55 -0700 Subject: [PATCH 04/15] Version and endpoint --- .github/workflows/e2e-preset-test.yml | 5 +++++ presets/models/supported_models.yaml | 27 +++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 06dd5ac3b..c22e7d6cc 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -274,6 +274,11 @@ jobs: if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz + + - name: Test version endpoint + if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') + run: | + curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/version - name: Test inference endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 3f8dda09e..11c88cb0f 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,28 +3,29 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-13b type: llama2-completion runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-70b type: llama2-completion runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 # Tag history: + # 0.0.4 - Version endpoint # 0.0.3 - Inference API Cleanup (#233) # 0.0.2 - Eliminate Unnecessary Process Group Creation in Worker Initialization (#244) # 0.0.1 - Initial Release @@ -34,23 +35,24 @@ models: type: text-generation version: 
https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 - name: falcon-7b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 - name: falcon-40b type: text-generation version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 - name: falcon-40b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Version endpoint # 0.0.2 - Inference API Cleanup (#233) # 0.0.1 - Initial Release @@ -59,13 +61,14 @@ models: type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 - name: mistral-7b-instruct type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Version endpoint # 0.0.2 - Inference API Cleanup (#233) # 0.0.1 - Initial Release From e85ee06d947bc82e4a1448fe2f9d0bad25006e4c Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 14 Mar 2024 14:10:42 -0700 Subject: [PATCH 05/15] Resync --- presets/models/falcon/model.go | 8 ++++---- presets/models/mistral/model.go | 12 ++++++------ presets/models/phi/model.go | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/presets/models/falcon/model.go b/presets/models/falcon/model.go index 00dfd0d77..7501dce23 100644 --- a/presets/models/falcon/model.go +++ b/presets/models/falcon/model.go @@ -37,10 +37,10 @@ var ( PresetFalcon40BInstructModel = PresetFalcon40BModel + "-instruct" PresetFalconTagMap = map[string]string{ - "Falcon7B": "0.0.2", - "Falcon7BInstruct": "0.0.2", - "Falcon40B": "0.0.2", - "Falcon40BInstruct": "0.0.2", + "Falcon7B": "0.0.3", + "Falcon7BInstruct": "0.0.3", + "Falcon40B": "0.0.3", + "Falcon40BInstruct": "0.0.3", } baseCommandPresetFalcon = "accelerate launch" diff --git a/presets/models/mistral/model.go b/presets/models/mistral/model.go index 3f1d79d15..7089eafb6 100644 --- a/presets/models/mistral/model.go +++ b/presets/models/mistral/model.go @@ -23,12 +23,12 @@ func init() { } var ( - PresetMistral7BModel = "mistral-7b" - PresetMistral7BInstructModel = PresetMistral7BModel + "-instruct" + PresetMistral7BModel = "mistral-7b" + PresetMistral7BInstructModel = PresetMistral7BModel + "-instruct" PresetMistralTagMap = map[string]string{ - "Mistral7B": "0.0.2", - "Mistral7BInstruct": "0.0.2", + "Mistral7B": "0.0.3", + "Mistral7BInstruct": "0.0.3", } baseCommandPresetMistral = "accelerate launch" @@ -46,7 +46,7 @@ func (*mistral7b) GetInferenceParameters() *model.PresetInferenceParam { return &model.PresetInferenceParam{ ModelFamilyName: "Mistral", ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic), - DiskStorageRequirement: "50Gi", + DiskStorageRequirement: "100Gi", GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Mistral using native vertical model parallel, no per GPU memory requirement. 
@@ -70,7 +70,7 @@ func (*mistral7bInst) GetInferenceParameters() *model.PresetInferenceParam { return &model.PresetInferenceParam{ ModelFamilyName: "Mistral", ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic), - DiskStorageRequirement: "50Gi", + DiskStorageRequirement: "100Gi", GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "0Gi", // We run mistral using native vertical model parallel, no per GPU memory requirement. diff --git a/presets/models/phi/model.go b/presets/models/phi/model.go index e819256b6..2e54dce38 100644 --- a/presets/models/phi/model.go +++ b/presets/models/phi/model.go @@ -19,10 +19,10 @@ func init() { } var ( - PresetPhi2Model = "phi-2" + PresetPhi2Model = "phi-2" PresetPhiTagMap = map[string]string{ - "Phi2": "0.0.1", + "Phi2": "0.0.2", } baseCommandPresetPhi = "accelerate launch" @@ -40,7 +40,7 @@ func (*phi2) GetInferenceParameters() *model.PresetInferenceParam { return &model.PresetInferenceParam{ ModelFamilyName: "Phi", ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic), - DiskStorageRequirement: "30Gi", + DiskStorageRequirement: "50Gi", GPUCountRequirement: "1", TotalGPUMemoryRequirement: "12Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. From f29877754e01a124110f0fc5617aec03a2df88cb Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 14 Mar 2024 14:12:48 -0700 Subject: [PATCH 06/15] Phi2 --- presets/models/supported_models.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 7d85d07c8..97c2a2697 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -79,7 +79,8 @@ models: type: text-generation version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Version endpoint # 0.0.2 - Update Default Params (#294) # 0.0.1 - Initial Release From 478f68b6fdc8a582e5a4edc505d2d7372e31edc3 Mon Sep 17 00:00:00 2001 From: Evan Date: Tue, 19 Mar 2024 20:21:11 -0700 Subject: [PATCH 07/15] Version --- presets/models/supported_models.yaml | 45 ++++++++++++++++------------ 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 97c2a2697..6ac9480d7 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,27 +3,27 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-13b type: llama2-completion runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-70b type: llama2-completion runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.4 + version: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Inference API Cleanup (#233) @@ -33,24 +33,28 @@ models: # Falcon - name: falcon-7b type: text-generation - version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 + revisionID: 898df1396f35e447d5fe44e0a3ccaaaa69f30d36 + commitID: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - tag: 
0.0.4 + version: 0.0.4 - name: falcon-7b-instruct type: text-generation - version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 + revisionID: cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 + commitID: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - tag: 0.0.4 + version: 0.0.4 - name: falcon-40b type: text-generation - version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 + revisionID: 4a70170c215b36a3cce4b4253f6d0612bb7d4146 + commitURL: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - tag: 0.0.4 + version: 0.0.4 - name: falcon-40b-instruct type: text-generation - version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f + revisionID: ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f + commitURL: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - tag: 0.0.4 + version: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Update Default Params (#294) @@ -60,14 +64,16 @@ models: # Mistral - name: mistral-7b type: text-generation - version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 + revisionID: 26bca36bde8333b5d7f72e9ed20ccda6a618af24 + commitURL: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 runtime: tfs tag: 0.0.4 - name: mistral-7b-instruct type: text-generation - version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 + revisionID: b70aa86578567ba3301b21c8a27bea4e8f6d6d61 + commitURL: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs - tag: 0.0.4 + version: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Update Default Params (#294) @@ -77,9 +83,10 @@ models: # Phi-2 - name: phi-2 type: text-generation - version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 + revisionID: b10c3eba545ad279e7208ee3a5d644566f001670 + commitURL: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 runtime: tfs - tag: 0.0.3 + version: 0.0.3 # Tag history: # 0.0.3 - Version endpoint # 0.0.2 - Update Default Params (#294) From 60bada85658716104ad7654cbc46cfcbd1df60a3 Mon Sep 17 00:00:00 2001 From: Evan Date: Wed, 27 Mar 2024 13:25:12 -0700 Subject: [PATCH 08/15] Versioning fixes --- docker/presets/llama-2/Dockerfile | 3 ++- docker/presets/tfs/Dockerfile | 2 +- presets/models/supported_models.yaml | 21 +++++++-------------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index e5854f805..c9e10fc40 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -3,6 +3,7 @@ # --build-arg WEIGHTS_PATH=/weights \ # --build-arg VERSION={{VERSION}} \ # --build-arg MODEL_TYPE={{MODEL_TYPE}} \ +# --build-arg IMAGE_NAME={{IMAGE_NAME}} \ FROM python:3.8-slim WORKDIR /workspace @@ -29,7 +30,7 @@ ARG VERSION ARG IMAGE_NAME # Write metadata to model_info.txt file -RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/llama/model_info.txt +RUN echo -e "$MODEL_TYPE\n$VERSION\n$IMAGE_NAME" > /workspace/llama/model_info.txt ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD 
kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 4f04df4c3..c2c0a40f2 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -9,7 +9,7 @@ ARG IMAGE_NAME WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "$MODEL_TYPE\n$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 6ac9480d7..460f5210e 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -33,26 +33,22 @@ models: # Falcon - name: falcon-7b type: text-generation - revisionID: 898df1396f35e447d5fe44e0a3ccaaaa69f30d36 - commitID: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 + version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs version: 0.0.4 - name: falcon-7b-instruct type: text-generation - revisionID: cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 - commitID: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 + version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs version: 0.0.4 - name: falcon-40b type: text-generation - revisionID: 4a70170c215b36a3cce4b4253f6d0612bb7d4146 - commitURL: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 + version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs version: 0.0.4 - name: falcon-40b-instruct type: text-generation - revisionID: ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f - commitURL: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f + version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs version: 0.0.4 # Tag history: @@ -64,14 +60,12 @@ models: # Mistral - name: mistral-7b type: text-generation - revisionID: 26bca36bde8333b5d7f72e9ed20ccda6a618af24 - commitURL: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 + version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 runtime: tfs tag: 0.0.4 - name: mistral-7b-instruct type: text-generation - revisionID: b70aa86578567ba3301b21c8a27bea4e8f6d6d61 - commitURL: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 + version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs version: 0.0.4 # Tag history: @@ -83,8 +77,7 @@ models: # Phi-2 - name: phi-2 type: text-generation - revisionID: b10c3eba545ad279e7208ee3a5d644566f001670 - commitURL: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 + version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 runtime: tfs version: 0.0.3 # Tag history: From 29f14e08a1035c3d83d35db8a97ac6b1b63a99b9 Mon Sep 17 00:00:00 2001 From: Evan Date: Sat, 30 Mar 2024 18:23:58 -0700 Subject: [PATCH 09/15] Adding MODEL_VERSION into .txt file --- 
.../kind-cluster/docker-job-template.yaml | 1 + docker/presets/llama-2/Dockerfile | 4 +++- docker/presets/tfs-onnx/Dockerfile | 3 ++- docker/presets/tfs/Dockerfile | 3 ++- .../llama2-completion/inference_api.py | 5 +++- presets/models/supported_models.yaml | 24 +++++++++---------- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/.github/workflows/kind-cluster/docker-job-template.yaml b/.github/workflows/kind-cluster/docker-job-template.yaml index 64d86da54..99954233f 100644 --- a/.github/workflows/kind-cluster/docker-job-template.yaml +++ b/.github/workflows/kind-cluster/docker-job-template.yaml @@ -44,6 +44,7 @@ spec: --build-arg VERSION={{VERSION}} \ --build-arg MODEL_TYPE={{MODEL_TYPE}} \ --build-arg IMAGE_NAME={{IMAGE_NAME}} \ + --build-arg MODEL_VERSION={{MODEL_VERSION}} \ -f $DOCKERFILE_PATH / docker push $ACR_NAME.azurecr.io/{{IMAGE_NAME}}:$VERSION env: diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index c9e10fc40..7c44a83d4 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -4,6 +4,7 @@ # --build-arg VERSION={{VERSION}} \ # --build-arg MODEL_TYPE={{MODEL_TYPE}} \ # --build-arg IMAGE_NAME={{IMAGE_NAME}} \ +# --build-arg MODEL_VERSION={{MODEL_VERSION}} \ FROM python:3.8-slim WORKDIR /workspace @@ -28,9 +29,10 @@ ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION ARG IMAGE_NAME +ARG MODEL_VERSION # Write metadata to model_info.txt file -RUN echo -e "$MODEL_TYPE\n$VERSION\n$IMAGE_NAME" > /workspace/llama/model_info.txt +RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/llama/model_info.txt ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index a13dabe28..27463d1a5 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -5,12 +5,13 @@ ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION ARG IMAGE_NAME +ARG MODEL_VERSION # Set the working directory WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$VERSION: $IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/tfs/model_info.txt # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index c2c0a40f2..39aebe1bf 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -4,12 +4,13 @@ ARG WEIGHTS_PATH ARG MODEL_TYPE ARG VERSION ARG IMAGE_NAME +ARG MODEL_VERSION # Set the working directory WORKDIR /workspace/tfs # Write metadata to model_info.txt file -RUN echo -e "$MODEL_TYPE\n$VERSION\n$IMAGE_NAME" > /workspace/tfs/model_info.txt +RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/tfs/model_info.txt # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/inference/llama2-completion/inference_api.py b/presets/inference/llama2-completion/inference_api.py index 612b4e789..4f70fbda1 100644 --- a/presets/inference/llama2-completion/inference_api.py +++ 
b/presets/inference/llama2-completion/inference_api.py @@ -8,6 +8,7 @@ import signal import sys import threading +import json from typing import Optional import GPUtil @@ -188,7 +189,9 @@ def get_version(): with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: model_name = f.read() - return {"version": model_name} + # Convert readable text file to json + data = json.dumps(dict(map(str.strip, line.split(':'))) for line in model_name.split('\n')) + return data def setup_worker_routes(): @app_worker.get("/healthz") diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 460f5210e..97c2a2697 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,27 +3,27 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-13b type: llama2-completion runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-70b type: llama2-completion runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - version: 0.0.4 + tag: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Inference API Cleanup (#233) @@ -35,22 +35,22 @@ models: type: text-generation version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - version: 0.0.4 + tag: 0.0.4 - name: falcon-7b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - version: 0.0.4 + tag: 0.0.4 - name: falcon-40b type: text-generation version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - version: 0.0.4 + tag: 0.0.4 - name: falcon-40b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - version: 0.0.4 + tag: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Update Default Params (#294) @@ -67,7 +67,7 @@ models: type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs - version: 0.0.4 + tag: 0.0.4 # Tag history: # 0.0.4 - Version endpoint # 0.0.3 - Update Default Params (#294) @@ -79,7 +79,7 @@ models: type: text-generation version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 runtime: tfs - version: 0.0.3 + tag: 0.0.3 # Tag history: # 0.0.3 - Version endpoint # 0.0.2 - Update Default Params (#294) From fcc19a13eb44e6b12a38de9d35e28282b2d7da94 Mon Sep 17 00:00:00 2001 From: Evan Date: Tue, 2 Apr 2024 18:57:48 -0700 Subject: [PATCH 10/15] MODEL VERSION HASH --- docker/presets/llama-2/Dockerfile | 5 +++-- docker/presets/tfs-onnx/Dockerfile | 5 +++-- docker/presets/tfs/Dockerfile | 5 +++-- presets/inference/llama2-chat/inference_api.py | 7 ++++--- presets/inference/llama2-completion/inference_api.py | 8 +++----- presets/inference/text-generation/inference_api.py | 7 ++++--- 6 files changed, 20 insertions(+), 17 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 7c44a83d4..4fbd3e7e4 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -31,8 +31,9 @@ ARG VERSION ARG IMAGE_NAME 
ARG MODEL_VERSION -# Write metadata to model_info.txt file -RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/llama/model_info.txt +# Write metadata to model_info.json file +RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ + echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/llama/model_info.json ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2 diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index 27463d1a5..cd23ac54b 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -10,8 +10,9 @@ ARG MODEL_VERSION # Set the working directory WORKDIR /workspace/tfs -# Write metadata to model_info.txt file -RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/tfs/model_info.txt +# Write metadata to model_info.json file +RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ + echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json # First, copy just the requirements.txt file and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 39aebe1bf..13b9768ab 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -9,8 +9,9 @@ ARG MODEL_VERSION # Set the working directory WORKDIR /workspace/tfs -# Write metadata to model_info.txt file -RUN echo -e "Model Type: $MODEL_TYPE\nVersion: $VERSION\nImage Name: $IMAGE_NAME\nModel Version URL: $MODEL_VERSION" > /workspace/tfs/model_info.txt +# Write metadata to model_info.json file +RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ + echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json # First, copy just the preset files and install dependencies # This is done before copying the code to utilize Docker's layer caching and diff --git a/presets/inference/llama2-chat/inference_api.py b/presets/inference/llama2-chat/inference_api.py index b3a7a909c..a91786e53 100644 --- a/presets/inference/llama2-chat/inference_api.py +++ b/presets/inference/llama2-chat/inference_api.py @@ -8,6 +8,7 @@ import signal import sys import threading +import json from typing import Optional import GPUtil @@ -19,7 +20,7 @@ from pydantic import BaseModel # Constants -MODEL_INFO = "model_info.txt" +MODEL_INFO = "model_info.json" # Setup argparse parser = argparse.ArgumentParser(description="Llama API server.") @@ -197,9 +198,9 @@ def get_metrics(): @app_main.get("/version") def get_version(): with open(f"/workspace/llama/{MODEL_INFO}", "r") as f: - model_name = f.read() + model_info = json.load(f) - return {"version": model_name} + return model_info def setup_worker_routes(): @app_worker.get("/healthz") diff --git a/presets/inference/llama2-completion/inference_api.py 
b/presets/inference/llama2-completion/inference_api.py index 4f70fbda1..f29ba91e1 100644 --- a/presets/inference/llama2-completion/inference_api.py +++ b/presets/inference/llama2-completion/inference_api.py @@ -20,7 +20,7 @@ from pydantic import BaseModel # Constants -MODEL_INFO = "model_info.txt" +MODEL_INFO = "model_info.json" # Setup argparse parser = argparse.ArgumentParser(description="Llama API server.") @@ -187,11 +187,9 @@ def get_metrics(): @app_main.get("/version") def get_version(): with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: - model_name = f.read() + model_info = json.load(f) - # Convert readable text file to json - data = json.dumps(dict(map(str.strip, line.split(':'))) for line in model_name.split('\n')) - return data + return model_info def setup_worker_routes(): @app_worker.get("/healthz") diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index 06032e89a..8d92d53f3 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -3,6 +3,7 @@ import os from dataclasses import asdict, dataclass, field from typing import Any, Dict, List, Optional +import json import GPUtil import torch @@ -14,7 +15,7 @@ GenerationConfig, HfArgumentParser) # Constants -MODEL_INFO = "model_info.txt" +MODEL_INFO = "model_info.json" @dataclass class ModelConfig: @@ -215,9 +216,9 @@ def get_metrics(): @app.get("/version") def get_version(): with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f: - model_name = f.read() + model_info = json.load(f) - return {"version": model_name} + return model_info if __name__ == "__main__": local_rank = int(os.environ.get("LOCAL_RANK", 0)) # Default to 0 if not set From 7582ee52e8a3e1af6a3bf6ff5cb68c8b7a07cf03 Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 4 Apr 2024 15:09:02 -0700 Subject: [PATCH 11/15] Get Hash --- docker/presets/llama-2/Dockerfile | 2 +- docker/presets/tfs-onnx/Dockerfile | 2 +- docker/presets/tfs/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 4fbd3e7e4..19a869904 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -32,7 +32,7 @@ ARG IMAGE_NAME ARG MODEL_VERSION # Write metadata to model_info.json file -RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ +RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \ echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/llama/model_info.json ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights diff --git a/docker/presets/tfs-onnx/Dockerfile b/docker/presets/tfs-onnx/Dockerfile index cd23ac54b..8fdfc7440 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -11,7 +11,7 @@ ARG MODEL_VERSION WORKDIR /workspace/tfs # Write metadata to model_info.json file -RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ +RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \ echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json # First, copy just the requirements.txt file and install dependencies diff --git a/docker/presets/tfs/Dockerfile 
b/docker/presets/tfs/Dockerfile index 13b9768ab..863e40728 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -10,7 +10,7 @@ ARG MODEL_VERSION WORKDIR /workspace/tfs # Write metadata to model_info.json file -RUN MODEL_VERSION_HASH=$(echo "$MODEL_VERSION" | awk -F'/' '{print $NF}') && \ +RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \ echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json # First, copy just the preset files and install dependencies From 9db1a0428bdef0990ef50bdecdcb84fee60679fe Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 4 Apr 2024 15:27:27 -0700 Subject: [PATCH 12/15] Version comments --- presets/models/supported_models.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 900c4691b..ae57ff75d 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -25,7 +25,7 @@ models: runtime: llama-2 tag: 0.0.4 # Tag history: - # 0.0.4 - Version endpoint + # 0.0.4 - Version endpoint (#297) # 0.0.3 - Inference API Cleanup (#233) # 0.0.2 - Eliminate Unnecessary Process Group Creation in Worker Initialization (#244) # 0.0.1 - Initial Release @@ -42,7 +42,7 @@ models: runtime: tfs tag: 0.0.5 # Tag history: - # 0.0.5 - Version endpoint + # 0.0.5 - Version endpoint (#297) # 0.0.4 - Adjust default model params (#310) # 0.0.3 - Update Default Params (#294) # 0.0.2 - Inference API Cleanup (#233) @@ -59,7 +59,7 @@ models: runtime: tfs tag: 0.0.6 # Tag history for 40b models: - # 0.0.6 - Version endpoint + # 0.0.6 - Version endpoint (#297) # 0.0.5 - Adjust default model params (#310) # 0.0.4 - Skipped due to incomplete upload issue # 0.0.3 - Update Default Params (#294) @@ -78,7 +78,7 @@ models: runtime: tfs tag: 0.0.5 # Tag history: - # 0.0.5 - Version endpoint + # 0.0.5 - Version endpoint (#297) # 0.0.4 - Adjust default model params (#310) # 0.0.3 - Update Default Params (#294) # 0.0.2 - Inference API Cleanup (#233) @@ -91,7 +91,7 @@ models: runtime: tfs tag: 0.0.4 # Tag history: - # 0.0.4 - Version endpoint + # 0.0.4 - Version endpoint (#297) # 0.0.3 - Adjust default model params (#310) # 0.0.2 - Update Default Params (#294) # 0.0.1 - Initial Release From bf015c079289713f9b35e66b3c3f0fcb9f3d7c02 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Thu, 4 Apr 2024 17:15:27 -0700 Subject: [PATCH 13/15] fix: Checkout Evans awesome fork --- .github/workflows/e2e-preset-test.yml | 6 +----- .github/workflows/kind-cluster/determine_models.py | 14 +++++++++++--- .github/workflows/preset-image-build.yml | 1 + 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index c22e7d6cc..3c2b1365d 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -48,6 +48,7 @@ jobs: run: | PR_BRANCH=${{ env.BRANCH_NAME }} \ FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \ + PR_REPO_URL=${{ github.event.pull_request.head.repo.clone_url }} \ python3 .github/workflows/kind-cluster/determine_models.py - name: Print Determined Models @@ -274,11 +275,6 @@ jobs: if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | curl http://${{ steps.get_ip.outputs.SERVICE_IP 
}}:80/healthz - - - name: Test version endpoint - if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') - run: | - curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/version - name: Test inference endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') diff --git a/.github/workflows/kind-cluster/determine_models.py b/.github/workflows/kind-cluster/determine_models.py index 5ace3ba63..18b5773e1 100644 --- a/.github/workflows/kind-cluster/determine_models.py +++ b/.github/workflows/kind-cluster/determine_models.py @@ -90,7 +90,7 @@ def models_to_build(files_changed): seen_model_types.add(model_info["type"]) return list(models) -def check_modified_models(pr_branch): +def check_modified_models(pr_branch, pr_repo_url): """Check for modified models in the repository.""" repo_dir = Path.cwd() / "repo" @@ -102,7 +102,14 @@ def check_modified_models(pr_branch): run_command("git checkout --detach") run_command("git fetch origin main:main") - run_command(f"git fetch origin {pr_branch}:{pr_branch}") + + fetch_command = f"git fetch origin {pr_branch}:{pr_branch}" + if pr_repo_url != KAITO_REPO_URL: + # Add the PR's repo as a new remote only if it's different from the main repo + run_command("git remote add pr_repo {}".format(pr_repo_url)) + fetch_command = f"git fetch pr_repo {pr_branch}" + + run_command(fetch_command) run_command(f"git checkout {pr_branch}") files = run_command("git diff --name-only origin/main") # Returns each file on newline @@ -118,6 +125,7 @@ def check_modified_models(pr_branch): def main(): pr_branch = os.environ.get("PR_BRANCH", "main") # If not specified default to 'main' force_run_all = os.environ.get("FORCE_RUN_ALL", "false") # If not specified default to False + pr_repo_url = os.environ.get("PR_REPO_URL", KAITO_REPO_URL) affected_models = [] if force_run_all != "false": @@ -125,7 +133,7 @@ def main(): else: # Logic to determine affected models # Example: affected_models = ['model1', 'model2', 'model3'] - affected_models = check_modified_models(pr_branch) + affected_models = check_modified_models(pr_branch, pr_repo_url) # Convert the list of models into JSON matrix format matrix = create_matrix(affected_models) diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml index 5cdb8f98e..a5f100560 100644 --- a/.github/workflows/preset-image-build.yml +++ b/.github/workflows/preset-image-build.yml @@ -55,6 +55,7 @@ jobs: run: | PR_BRANCH=${{ env.BRANCH_NAME }} \ FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \ + PR_REPO_URL=${{ github.event.pull_request.head.repo.clone_url }} \ python3 .github/workflows/kind-cluster/determine_models.py - name: Print Determined Models From 6b88a937f2e56ccbde416a70ea3e59ef166febf6 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Thu, 4 Apr 2024 17:23:18 -0700 Subject: [PATCH 14/15] fix: Checkout Evans awesome fork --- .github/workflows/e2e-preset-test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 3c2b1365d..f89d1660c 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -275,6 +275,11 @@ jobs: if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | curl http://${{ 
steps.get_ip.outputs.SERVICE_IP }}:80/healthz + + - name: Test version endpoint + if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') + run: | + curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/version - name: Test inference endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') From 81ce9c9c87d8bbfb12fc6c702e95f6b7b1426e1f Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Thu, 4 Apr 2024 18:04:02 -0700 Subject: [PATCH 15/15] feat: Document version endpoint --- .../inference/text-generation/api_spec.json | 1199 +++++++++-------- .../text-generation/inference_api.py | 77 +- 2 files changed, 699 insertions(+), 577 deletions(-) diff --git a/presets/inference/text-generation/api_spec.json b/presets/inference/text-generation/api_spec.json index 480fa97e4..8cdb9c16d 100644 --- a/presets/inference/text-generation/api_spec.json +++ b/presets/inference/text-generation/api_spec.json @@ -1,599 +1,658 @@ { "openapi": "3.1.0", "info": { - "title": "FastAPI", - "version": "0.1.0" + "title": "FastAPI", + "version": "0.1.0" }, "paths": { - "/": { - "get": { - "summary": "Home Endpoint", - "description": "A simple endpoint that indicates the server is running.\nNo parameters are required. Returns a message indicating the server status.", - "operationId": "home__get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HomeResponse" - } - } - } - } + "/": { + "get": { + "summary": "Home Endpoint", + "description": "A simple endpoint that indicates the server is running.\nNo parameters are required. 
Returns a message indicating the server status.", + "operationId": "home__get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HomeResponse" + } } + } } - }, - "/healthz": { - "get": { - "summary": "Health Check Endpoint", - "operationId": "health_check_healthz_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HealthStatus" - }, - "example": { - "status": "Healthy" - } - } - } - }, - "500": { - "description": "Error Response", - "content": { - "application/json": { - "examples": { - "model_uninitialized": { - "summary": "Model not initialized", - "value": { - "detail": "Model not initialized" - } - }, - "pipeline_uninitialized": { - "summary": "Pipeline not initialized", - "value": { - "detail": "Pipeline not initialized" - } - } - } - } - } + } + } + }, + "/healthz": { + "get": { + "summary": "Health Check Endpoint", + "operationId": "health_check_healthz_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HealthStatus" + }, + "example": { + "status": "Healthy" + } + } + } + }, + "500": { + "description": "Error Response", + "content": { + "application/json": { + "examples": { + "model_uninitialized": { + "summary": "Model not initialized", + "value": { + "detail": "Model not initialized" + } + }, + "pipeline_uninitialized": { + "summary": "Pipeline not initialized", + "value": { + "detail": "Pipeline not initialized" + } } + } } + } } - }, - "/chat": { - "post": { - "summary": "Chat Endpoint", - "description": "Processes chat requests, generating text based on the specified pipeline (text generation or conversational).\nValidates required parameters based on the pipeline and returns the generated text.", - "operationId": "generate_text_chat_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/UnifiedRequestModel" - }, - "examples": { - "text_generation_example": { - "summary": "Text Generation Example", - "description": "An example of a text generation request.", - "value": { - "prompt": "Tell me a joke", - "return_full_text": true, - "clean_up_tokenization_spaces": false, - "generate_kwargs": { - "max_length": 200, - "min_length": 0, - "do_sample": true, - "early_stopping": false, - "num_beams": 1, - "temperature": 1, - "top_k": 10, - "top_p": 1, - "typical_p": 1, - "repetition_penalty": 1, - "eos_token_id": 11 - } - } - }, - "conversation_example": { - "summary": "Conversation Example", - "description": "An example of a conversational request.", - "value": { - "messages": [ - { - "role": "user", - "content": "What is your favourite condiment?" - }, - { - "role": "assistant", - "content": "Well, im quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever im cooking up in the kitchen!" - }, - { - "role": "user", - "content": "Do you have mayonnaise recipes?" 
- } - ], - "return_full_text": true, - "clean_up_tokenization_spaces": false, - "generate_kwargs": { - "max_length": 200, - "min_length": 0, - "do_sample": true, - "early_stopping": false, - "num_beams": 1, - "temperature": 1, - "top_k": 10, - "top_p": 1, - "typical_p": 1, - "repetition_penalty": 1, - "eos_token_id": 11 - } - } - } - } - } - }, - "required": true + } + } + }, + "/chat": { + "post": { + "summary": "Chat Endpoint", + "description": "Processes chat requests, generating text based on the specified pipeline (text generation or conversational).\nValidates required parameters based on the pipeline and returns the generated text.", + "operationId": "generate_text_chat_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UnifiedRequestModel" }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {}, - "examples": { - "text_generation": { - "summary": "Text Generation Response", - "value": { - "Result": "Generated text based on the prompt." - } - }, - "conversation": { - "summary": "Conversation Response", - "value": { - "Result": "Response to the last message in the conversation." - } - } - } - } - } - }, - "400": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "examples": { - "missing_prompt": { - "summary": "Missing Prompt", - "value": { - "detail": "Text generation parameter prompt required" - } - }, - "missing_messages": { - "summary": "Missing Messages", - "value": { - "detail": "Conversational parameter messages required" - } - } - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } + "examples": { + "text_generation_example": { + "summary": "Text Generation Example", + "description": "An example of a text generation request.", + "value": { + "prompt": "Tell me a joke", + "return_full_text": true, + "clean_up_tokenization_spaces": false, + "generate_kwargs": { + "max_length": 200, + "min_length": 0, + "do_sample": true, + "early_stopping": false, + "num_beams": 1, + "temperature": 1, + "top_k": 10, + "top_p": 1, + "typical_p": 1, + "repetition_penalty": 1, + "eos_token_id": 11 + } + } + }, + "conversation_example": { + "summary": "Conversation Example", + "description": "An example of a conversational request.", + "value": { + "messages": [ + { + "role": "user", + "content": "What is your favourite condiment?" + }, + { + "role": "assistant", + "content": "Well, im quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever im cooking up in the kitchen!" + }, + { + "role": "user", + "content": "Do you have mayonnaise recipes?" 
} + ], + "return_full_text": true, + "clean_up_tokenization_spaces": false, + "generate_kwargs": { + "max_length": 200, + "min_length": 0, + "do_sample": true, + "early_stopping": false, + "num_beams": 1, + "temperature": 1, + "top_k": 10, + "top_p": 1, + "typical_p": 1, + "repetition_penalty": 1, + "eos_token_id": 11 + } + } + } + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {}, + "examples": { + "text_generation": { + "summary": "Text Generation Response", + "value": { + "Result": "Generated text based on the prompt." + } + }, + "conversation": { + "summary": "Conversation Response", + "value": { + "Result": "Response to the last message in the conversation." + } + } + } + } + } + }, + "400": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "examples": { + "missing_prompt": { + "summary": "Missing Prompt", + "value": { + "detail": "Text generation parameter prompt required" + } + }, + "missing_messages": { + "summary": "Missing Messages", + "value": { + "detail": "Conversational parameter messages required" + } } + } } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } } - }, - "/metrics": { - "get": { - "summary": "Metrics Endpoint", - "description": "Provides system metrics, including GPU details if available, or CPU and memory usage otherwise.\nUseful for monitoring the resource utilization of the server running the ML models.", - "operationId": "get_metrics_metrics_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/MetricsResponse" - }, - "examples": { - "gpu_metrics": { - "summary": "Example when GPUs are available", - "value": { - "gpu_info": [ - { - "id": "GPU-1234", - "name": "GeForce GTX 950", - "load": "25.00%", - "temperature": "55 C", - "memory": { - "used": "1.00 GB", - "total": "2.00 GB" - } - } - ] - } - }, - "cpu_metrics": { - "summary": "Example when only CPU is available", - "value": { - "cpu_info": { - "load_percentage": 20, - "physical_cores": 4, - "total_cores": 8, - "memory": { - "used": "4.00 GB", - "total": "16.00 GB" - } - } - } - } - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } + } + } + }, + "/metrics": { + "get": { + "summary": "Metrics Endpoint", + "description": "Provides system metrics, including GPU details if available, or CPU and memory usage otherwise.\nUseful for monitoring the resource utilization of the server running the ML models.", + "operationId": "get_metrics_metrics_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MetricsResponse" + }, + "examples": { + "gpu_metrics": { + "summary": "Example when GPUs are available", + "value": { + "gpu_info": [ + { + "id": "GPU-1234", + "name": "GeForce GTX 950", + "load": "25.00%", + "temperature": "55 C", + "memory": { + "used": "1.00 GB", + "total": "2.00 GB" } + } + ] + } + }, 
+ "cpu_metrics": { + "summary": "Example when only CPU is available", + "value": { + "cpu_info": { + "load_percentage": 20, + "physical_cores": 4, + "total_cores": 8, + "memory": { + "used": "4.00 GB", + "total": "16.00 GB" + } } + } } + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } } + } } + } } - }, - "components": { - "schemas": { - "CPUInfo": { - "properties": { - "load_percentage": { - "type": "number", - "title": "Load Percentage" - }, - "physical_cores": { - "type": "integer", - "title": "Physical Cores" - }, - "total_cores": { - "type": "integer", - "title": "Total Cores" - }, - "memory": { - "$ref": "#/components/schemas/MemoryInfo" + }, + "/version": { + "get": { + "summary": "Get Model Information", + "description": "Reads and returns model version information from a predefined JSON file.", + "operationId": "get_version_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {}, + "examples": { + "model_info": { + "summary": "Model Information Response", + "value": { + "Model Type": "Your Model Type", + "Version": "1.0.0", + "Image Name": "model_image_name", + "Model Version URL": "http://example.com/model/version", + "REVISION_ID": "revision_hash" + } } - }, - "type": "object", - "required": [ - "load_percentage", - "physical_cores", - "total_cores", - "memory" - ], - "title": "CPUInfo" - }, - "ErrorResponse": { - "properties": { - "detail": { - "type": "string", - "title": "Detail" - } - }, - "type": "object", - "required": [ - "detail" - ], - "title": "ErrorResponse" - }, - "GPUInfo": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "name": { - "type": "string", - "title": "Name" - }, - "load": { - "type": "string", - "title": "Load" - }, - "temperature": { - "type": "string", - "title": "Temperature" - }, - "memory": { - "$ref": "#/components/schemas/MemoryInfo" - } - }, - "type": "object", - "required": [ - "id", - "name", - "load", - "temperature", - "memory" - ], - "title": "GPUInfo" - }, - "GenerateKwargs": { - "properties": { - "max_length": { - "type": "integer", - "title": "Max Length", - "default": 200 - }, - "min_length": { - "type": "integer", - "title": "Min Length", - "default": 0 - }, - "do_sample": { - "type": "boolean", - "title": "Do Sample", - "default": true - }, - "early_stopping": { - "type": "boolean", - "title": "Early Stopping", - "default": false - }, - "num_beams": { - "type": "integer", - "title": "Num Beams", - "default": 1 - }, - "temperature": { - "type": "number", - "title": "Temperature", - "default": 1 - }, - "top_k": { - "type": "integer", - "title": "Top K", - "default": 10 - }, - "top_p": { - "type": "number", - "title": "Top P", - "default": 1 - }, - "typical_p": { - "type": "number", - "title": "Typical P", - "default": 1 - }, - "repetition_penalty": { - "type": "number", - "title": "Repetition Penalty", - "default": 1 - }, - "pad_token_id": { - "type": "integer", - "title": "Pad Token Id" - }, - "eos_token_id": { - "type": "integer", - "title": "Eos Token Id", - "default": 11 - } - }, - "type": "object", - "title": "GenerateKwargs", - "example": { - "max_length": 200, - "temperature": 0.7, - "top_p": 0.9, - "additional_param": "Example value" + } } + } }, - "HTTPValidationError": { - "properties": { - "detail": { - "items": { - "$ref": "#/components/schemas/ValidationError" - }, - "type": "array", - "title": 
"Detail" - } - }, - "type": "object", - "title": "HTTPValidationError" - }, - "HealthStatus": { - "properties": { - "status": { - "type": "string", - "title": "Status", - "example": "Healthy" - } - }, - "type": "object", - "required": [ - "status" - ], - "title": "HealthStatus" - }, - "HomeResponse": { - "properties": { - "message": { - "type": "string", - "title": "Message", - "example": "Server is running" - } - }, - "type": "object", - "required": [ - "message" - ], - "title": "HomeResponse" - }, - "MemoryInfo": { - "properties": { - "used": { - "type": "string", - "title": "Used" - }, - "total": { - "type": "string", - "title": "Total" + "404": { + "description": "Model Info Not Found", + "content": { + "application/json": { + "examples": { + "file_not_found": { + "summary": "Model Info File Not Found", + "value": { + "detail": "/workspace/tfs/model_info.json file not found." + } } - }, - "type": "object", - "required": [ - "used", - "total" - ], - "title": "MemoryInfo" - }, - "Message": { - "properties": { - "role": { - "type": "string", - "title": "Role" - }, - "content": { - "type": "string", - "title": "Content" - } - }, - "type": "object", - "required": [ - "role", - "content" - ], - "title": "Message" - }, - "MetricsResponse": { - "properties": { - "gpu_info": { - "items": { - "$ref": "#/components/schemas/GPUInfo" - }, - "type": "array", - "title": "Gpu Info" - }, - "cpu_info": { - "$ref": "#/components/schemas/CPUInfo" - } - }, - "type": "object", - "title": "MetricsResponse" - }, - "UnifiedRequestModel": { - "properties": { - "prompt": { - "type": "string", - "title": "Prompt", - "description": "Prompt for text generation. Required for text-generation pipeline. Do not use with 'messages'." - }, - "return_full_text": { - "type": "boolean", - "title": "Return Full Text", - "description": "Return full text if True, else only added text", - "default": true - }, - "clean_up_tokenization_spaces": { - "type": "boolean", - "title": "Clean Up Tokenization Spaces", - "description": "Clean up extra spaces in text output", - "default": false - }, - "prefix": { - "type": "string", - "title": "Prefix", - "description": "Prefix added to prompt" - }, - "handle_long_generation": { - "type": "string", - "title": "Handle Long Generation", - "description": "Strategy to handle long generation" - }, - "generate_kwargs": { - "allOf": [ - { - "$ref": "#/components/schemas/GenerateKwargs" - } - ], - "title": "Generate Kwargs", - "description": "Additional kwargs for generate method" - }, - "messages": { - "items": { - "$ref": "#/components/schemas/Message" - }, - "type": "array", - "title": "Messages", - "description": "Messages for conversational model. Required for conversational pipeline. Do not use with 'prompt'." - } - }, - "type": "object", - "title": "UnifiedRequestModel" - }, - "ValidationError": { - "properties": { - "loc": { - "items": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - } - ] - }, - "type": "array", - "title": "Location" - }, - "msg": { - "type": "string", - "title": "Message" - }, - "type": { - "type": "string", - "title": "Error Type" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "examples": { + "unexpected_error": { + "summary": "Unexpected Error", + "value": { + "detail": "An unexpected error occurred on the server." 
+ } } - }, - "type": "object", - "required": [ - "loc", - "msg", - "type" - ], - "title": "ValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "CPUInfo": { + "properties": { + "load_percentage": { + "type": "number", + "title": "Load Percentage" + }, + "physical_cores": { + "type": "integer", + "title": "Physical Cores" + }, + "total_cores": { + "type": "integer", + "title": "Total Cores" + }, + "memory": { + "$ref": "#/components/schemas/MemoryInfo" + } + }, + "type": "object", + "required": [ + "load_percentage", + "physical_cores", + "total_cores", + "memory" + ], + "title": "CPUInfo" + }, + "ErrorResponse": { + "properties": { + "detail": { + "type": "string", + "title": "Detail" + } + }, + "type": "object", + "required": [ + "detail" + ], + "title": "ErrorResponse" + }, + "GPUInfo": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "name": { + "type": "string", + "title": "Name" + }, + "load": { + "type": "string", + "title": "Load" + }, + "temperature": { + "type": "string", + "title": "Temperature" + }, + "memory": { + "$ref": "#/components/schemas/MemoryInfo" + } + }, + "type": "object", + "required": [ + "id", + "name", + "load", + "temperature", + "memory" + ], + "title": "GPUInfo" + }, + "GenerateKwargs": { + "properties": { + "max_length": { + "type": "integer", + "title": "Max Length", + "default": 200 + }, + "min_length": { + "type": "integer", + "title": "Min Length", + "default": 0 + }, + "do_sample": { + "type": "boolean", + "title": "Do Sample", + "default": true + }, + "early_stopping": { + "type": "boolean", + "title": "Early Stopping", + "default": false + }, + "num_beams": { + "type": "integer", + "title": "Num Beams", + "default": 1 + }, + "temperature": { + "type": "number", + "title": "Temperature", + "default": 1 + }, + "top_k": { + "type": "integer", + "title": "Top K", + "default": 10 + }, + "top_p": { + "type": "number", + "title": "Top P", + "default": 1 + }, + "typical_p": { + "type": "number", + "title": "Typical P", + "default": 1 + }, + "repetition_penalty": { + "type": "number", + "title": "Repetition Penalty", + "default": 1 + }, + "pad_token_id": { + "type": "integer", + "title": "Pad Token Id" + }, + "eos_token_id": { + "type": "integer", + "title": "Eos Token Id", + "default": 11 + } + }, + "type": "object", + "title": "GenerateKwargs", + "example": { + "max_length": 200, + "temperature": 0.7, + "top_p": 0.9, + "additional_param": "Example value" + } + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, + "HealthStatus": { + "properties": { + "status": { + "type": "string", + "title": "Status", + "example": "Healthy" + } + }, + "type": "object", + "required": [ + "status" + ], + "title": "HealthStatus" + }, + "HomeResponse": { + "properties": { + "message": { + "type": "string", + "title": "Message", + "example": "Server is running" + } + }, + "type": "object", + "required": [ + "message" + ], + "title": "HomeResponse" + }, + "MemoryInfo": { + "properties": { + "used": { + "type": "string", + "title": "Used" + }, + "total": { + "type": "string", + "title": "Total" + } + }, + "type": "object", + "required": [ + "used", + "total" + ], + "title": "MemoryInfo" + }, + "Message": { + "properties": { + "role": { + "type": "string", + "title": "Role" + }, + "content": { + "type": "string", + "title": "Content" + } + }, + 
"type": "object", + "required": [ + "role", + "content" + ], + "title": "Message" + }, + "MetricsResponse": { + "properties": { + "gpu_info": { + "items": { + "$ref": "#/components/schemas/GPUInfo" + }, + "type": "array", + "title": "Gpu Info" + }, + "cpu_info": { + "$ref": "#/components/schemas/CPUInfo" + } + }, + "type": "object", + "title": "MetricsResponse" + }, + "UnifiedRequestModel": { + "properties": { + "prompt": { + "type": "string", + "title": "Prompt", + "description": "Prompt for text generation. Required for text-generation pipeline. Do not use with 'messages'." + }, + "return_full_text": { + "type": "boolean", + "title": "Return Full Text", + "description": "Return full text if True, else only added text", + "default": true + }, + "clean_up_tokenization_spaces": { + "type": "boolean", + "title": "Clean Up Tokenization Spaces", + "description": "Clean up extra spaces in text output", + "default": false + }, + "prefix": { + "type": "string", + "title": "Prefix", + "description": "Prefix added to prompt" + }, + "handle_long_generation": { + "type": "string", + "title": "Handle Long Generation", + "description": "Strategy to handle long generation" + }, + "generate_kwargs": { + "allOf": [ + { + "$ref": "#/components/schemas/GenerateKwargs" + } + ], + "title": "Generate Kwargs", + "description": "Additional kwargs for generate method" + }, + "messages": { + "items": { + "$ref": "#/components/schemas/Message" + }, + "type": "array", + "title": "Messages", + "description": "Messages for conversational model. Required for conversational pipeline. Do not use with 'prompt'." + } + }, + "type": "object", + "title": "UnifiedRequestModel" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "type": "array", + "title": "Location" + }, + "msg": { + "type": "string", + "title": "Message" + }, + "type": { + "type": "string", + "title": "Error Type" } + }, + "type": "object", + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError" } + } } -} \ No newline at end of file + } \ No newline at end of file diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index 8353381f9..23dccba5d 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -1,8 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+import json
 import os
 from dataclasses import asdict, dataclass, field
-import json
 from typing import Annotated, Any, Dict, List, Optional
 
 import GPUtil
@@ -17,7 +17,9 @@
                           GenerationConfig, HfArgumentParser)
 
 # Constants
-MODEL_INFO = "model_info.json"
+APP_DIR = "/workspace/tfs"
+WEIGHTS_DIR = f"{APP_DIR}/weights"
+MODEL_INFO_FILE = f"{APP_DIR}/model_info.json"
 
 @dataclass
 class ModelConfig:
@@ -25,7 +27,7 @@ class ModelConfig:
     """
     Transformers Model Configuration Parameters
     """
     pipeline: str = field(metadata={"help": "The model pipeline for the pre-trained model"})
-    pretrained_model_name_or_path: Optional[str] = field(default="/workspace/tfs/weights", metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"})
+    pretrained_model_name_or_path: Optional[str] = field(default=WEIGHTS_DIR, metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"})
     state_dict: Optional[Dict[str, Any]] = field(default=None, metadata={"help": "State dictionary for the model"})
     cache_dir: Optional[str] = field(default=None, metadata={"help": "Cache directory for the model"})
     from_tf: bool = field(default=False, metadata={"help": "Load model from a TensorFlow checkpoint"})
@@ -431,11 +433,72 @@ def get_metrics():
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
-@app.get("/version")
+@app.get(
+    "/version",
+    summary="Get Model Information",
+    response_description="Model Version Information",
+    responses={
+        200: {
+            "description": "Successful Response",
+            "content": {
+                "application/json": {
+                    "examples": {
+                        "model_info": {
+                            "summary": "Model Information Response",
+                            "value": {
+                                "Model Type": "Your Model Type",
+                                "Version": "1.0.0",
+                                "Image Name": "model_image_name",
+                                "Model Version URL": "http://example.com/model/version",
+                                "REVISION_ID": "revision_hash"
+                            }
+                        }
+                    }
+                }
+            }
+        },
+        404: {
+            "description": "Model Info Not Found",
+            "content": {
+                "application/json": {
+                    "examples": {
+                        "file_not_found": {
+                            "summary": "Model Info File Not Found",
+                            "value": {"detail": f"{MODEL_INFO_FILE} file not found."}
+                        }
+                    }
+                }
+            }
+        },
+        500: {
+            "description": "Internal Server Error",
+            "content": {
+                "application/json": {
+                    "examples": {
+                        "unexpected_error": {
+                            "summary": "Unexpected Error",
+                            "value": {
+                                "detail": "An unexpected error occurred on the server."
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+)
 def get_version():
-    with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f:
-        model_info = json.load(f)
-
+    """
+    Reads and returns model version information from a predefined JSON file.
+    """
+    try:
+        with open(MODEL_INFO_FILE, "r") as f:
+            model_info = json.load(f)
+    except FileNotFoundError:
+        raise HTTPException(status_code=404, detail=f"{MODEL_INFO_FILE} file not found.")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
     return model_info
 
 if __name__ == "__main__":
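
A quick way to exercise the documented /version and /healthz endpoints once the container is up is a small client call. This is only an illustrative sketch, not part of the patch: it assumes the `requests` package is installed and that the inference server is reachable at localhost on port 5000 (adjust host and port to match your deployment).

    import requests

    BASE_URL = "http://localhost:5000"  # assumed address; change to your service endpoint

    # /version returns the model metadata recorded in model_info.json
    resp = requests.get(f"{BASE_URL}/version", timeout=10)
    resp.raise_for_status()
    print(resp.json())  # e.g. {"Model Type": "...", "Version": "1.0.0", "Image Name": "...", ...}

    # /healthz confirms the model and pipeline are initialized
    print(requests.get(f"{BASE_URL}/healthz", timeout=10).json())  # {"status": "Healthy"}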