diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index 06dd5ac3b..f89d1660c 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -48,6 +48,7 @@ jobs:
         run: |
           PR_BRANCH=${{ env.BRANCH_NAME }} \
           FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \
+          PR_REPO_URL=${{ github.event.pull_request.head.repo.clone_url }} \
           python3 .github/workflows/kind-cluster/determine_models.py
 
       - name: Print Determined Models
@@ -274,6 +275,11 @@ jobs:
         if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true')
         run: |
           curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz
+
+      - name: Test version endpoint
+        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true')
+        run: |
+          curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/version
 
       - name: Test inference endpoint
         if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true')
diff --git a/.github/workflows/kind-cluster/determine_models.py b/.github/workflows/kind-cluster/determine_models.py
index 5ace3ba63..18b5773e1 100644
--- a/.github/workflows/kind-cluster/determine_models.py
+++ b/.github/workflows/kind-cluster/determine_models.py
@@ -90,7 +90,7 @@ def models_to_build(files_changed):
             seen_model_types.add(model_info["type"])
     return list(models)
 
-def check_modified_models(pr_branch):
+def check_modified_models(pr_branch, pr_repo_url):
     """Check for modified models in the repository."""
     repo_dir = Path.cwd() / "repo"
 
@@ -102,7 +102,14 @@
     run_command("git checkout --detach")
     run_command("git fetch origin main:main")
-    run_command(f"git fetch origin {pr_branch}:{pr_branch}")
+
+    fetch_command = f"git fetch origin {pr_branch}:{pr_branch}"
+    if pr_repo_url != KAITO_REPO_URL:
+        # Add the PR's repo as a new remote only if it's different from the main repo
+        run_command(f"git remote add pr_repo {pr_repo_url}")
+        fetch_command = f"git fetch pr_repo {pr_branch}"
+
+    run_command(fetch_command)
     run_command(f"git checkout {pr_branch}")
 
     files = run_command("git diff --name-only origin/main") # Returns each file on newline
@@ -118,6 +125,7 @@
 def main():
     pr_branch = os.environ.get("PR_BRANCH", "main") # If not specified default to 'main'
     force_run_all = os.environ.get("FORCE_RUN_ALL", "false") # If not specified default to False
+    pr_repo_url = os.environ.get("PR_REPO_URL", KAITO_REPO_URL)
 
     affected_models = []
     if force_run_all != "false":
@@ -125,7 +133,7 @@
     else:
         # Logic to determine affected models
        # Example: affected_models = ['model1', 'model2', 'model3']
-        affected_models = check_modified_models(pr_branch)
+        affected_models = check_modified_models(pr_branch, pr_repo_url)
 
     # Convert the list of models into JSON matrix format
     matrix = create_matrix(affected_models)
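The new `PR_REPO_URL` input closes a gap for forked PRs: `github.event.pull_request.head.repo.clone_url` resolves to the contributor's fork, so the branch cannot always be fetched from `origin`. A standalone sketch of the resulting fetch strategy (the `KAITO_REPO_URL` value here is an assumption mirroring the constant already defined in the script):

```python
import os

# Assumed to mirror the constant defined in determine_models.py.
KAITO_REPO_URL = "https://github.com/Azure/kaito.git"

def plan_fetch(pr_branch: str, pr_repo_url: str) -> list:
    """Return the git commands the script would run for this PR."""
    if pr_repo_url != KAITO_REPO_URL:
        # Forked PR: the branch only exists on the contributor's repo,
        # so register the fork as a remote and fetch from it.
        return [
            f"git remote add pr_repo {pr_repo_url}",
            f"git fetch pr_repo {pr_branch}",
        ]
    # Same-repo PR: fetch the branch straight from origin.
    return [f"git fetch origin {pr_branch}:{pr_branch}"]

# PR_REPO_URL falls back to the upstream URL, so a missing variable
# behaves exactly like a same-repo PR.
pr_repo_url = os.environ.get("PR_REPO_URL", KAITO_REPO_URL)
print(plan_fetch(os.environ.get("PR_BRANCH", "main"), pr_repo_url))
```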
diff --git a/.github/workflows/kind-cluster/docker-job-template.yaml b/.github/workflows/kind-cluster/docker-job-template.yaml
index a19860f88..99954233f 100644
--- a/.github/workflows/kind-cluster/docker-job-template.yaml
+++ b/.github/workflows/kind-cluster/docker-job-template.yaml
@@ -43,6 +43,8 @@ spec:
             --build-arg WEIGHTS_PATH=/weights \
             --build-arg VERSION={{VERSION}} \
             --build-arg MODEL_TYPE={{MODEL_TYPE}} \
+            --build-arg IMAGE_NAME={{IMAGE_NAME}} \
+            --build-arg MODEL_VERSION={{MODEL_VERSION}} \
             -f $DOCKERFILE_PATH /
           docker push $ACR_NAME.azurecr.io/{{IMAGE_NAME}}:$VERSION
       env:
diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml
index 5cdb8f98e..a5f100560 100644
--- a/.github/workflows/preset-image-build.yml
+++ b/.github/workflows/preset-image-build.yml
@@ -55,6 +55,7 @@ jobs:
         run: |
           PR_BRANCH=${{ env.BRANCH_NAME }} \
           FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \
+          PR_REPO_URL=${{ github.event.pull_request.head.repo.clone_url }} \
           python3 .github/workflows/kind-cluster/determine_models.py
 
       - name: Print Determined Models
diff --git a/docker/presets/inference/llama-2/Dockerfile b/docker/presets/inference/llama-2/Dockerfile
index 285cb122a..4d85753d7 100644
--- a/docker/presets/inference/llama-2/Dockerfile
+++ b/docker/presets/inference/llama-2/Dockerfile
@@ -3,6 +3,8 @@
 # --build-arg WEIGHTS_PATH=/weights \
 # --build-arg VERSION={{VERSION}} \
 # --build-arg MODEL_TYPE={{MODEL_TYPE}} \
+# --build-arg IMAGE_NAME={{IMAGE_NAME}} \
+# --build-arg MODEL_VERSION={{MODEL_VERSION}} \
 FROM python:3.8-slim
 
 WORKDIR /workspace
@@ -26,8 +28,12 @@ RUN pip install 'uvicorn[standard]'
 ARG WEIGHTS_PATH
 ARG MODEL_TYPE
 ARG VERSION
-# Write the version to a file
-RUN echo $VERSION > /workspace/llama/version.txt
+ARG IMAGE_NAME
+ARG MODEL_VERSION
+
+# Write metadata to model_info.json file
+RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \
+    echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/llama/model_info.json
 
 ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights
 ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2
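The metadata write above (repeated in the tfs Dockerfiles below) relies on shell suffix stripping: `${MODEL_VERSION##*/}` drops everything through the last `/`, leaving the trailing commit hash of the HuggingFace URL. A sketch of the equivalent logic with illustrative values (all build args below are hypothetical; the URL format matches the `version` fields in supported_models.yaml):

```python
import json

# Hypothetical build args, mirroring what the CI job would pass in.
MODEL_TYPE = "text-generation"
VERSION = "0.0.5"
IMAGE_NAME = "falcon-7b"
MODEL_VERSION = "https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36"

# Shell's ${MODEL_VERSION##*/} strips everything up to the last '/',
# leaving the commit hash; rpartition does the same in Python.
model_version_hash = MODEL_VERSION.rpartition("/")[2]

model_info = {
    "Model Type": MODEL_TYPE,
    "Version": VERSION,
    "Image Name": IMAGE_NAME,
    "Model Version URL": MODEL_VERSION,
    "REVISION_ID": model_version_hash,
}
print(json.dumps(model_info, indent=2))
```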
diff --git a/docker/presets/inference/tfs-onnx/Dockerfile b/docker/presets/inference/tfs-onnx/Dockerfile
index 12e788346..8fdfc7440 100644
--- a/docker/presets/inference/tfs-onnx/Dockerfile
+++ b/docker/presets/inference/tfs-onnx/Dockerfile
@@ -4,12 +4,15 @@ FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu118-py38-torch211
 ARG WEIGHTS_PATH
 ARG MODEL_TYPE
 ARG VERSION
+ARG IMAGE_NAME
+ARG MODEL_VERSION
 
 # Set the working directory
 WORKDIR /workspace/tfs
 
-# Write the version to a file
-RUN echo $VERSION > /workspace/tfs/version.txt
+# Write metadata to model_info.json file
+RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \
+    echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json
 
 # First, copy just the requirements.txt file and install dependencies
 # This is done before copying the code to utilize Docker's layer caching and
diff --git a/docker/presets/inference/tfs/Dockerfile b/docker/presets/inference/tfs/Dockerfile
index 5a322b8bd..863e40728 100644
--- a/docker/presets/inference/tfs/Dockerfile
+++ b/docker/presets/inference/tfs/Dockerfile
@@ -3,12 +3,15 @@ FROM python:3.10-slim
 ARG WEIGHTS_PATH
 ARG MODEL_TYPE
 ARG VERSION
+ARG IMAGE_NAME
+ARG MODEL_VERSION
 
 # Set the working directory
 WORKDIR /workspace/tfs
 
-# Write the version to a file
-RUN echo $VERSION > /workspace/tfs/version.txt
+# Write metadata to model_info.json file
+RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \
+    echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json
 
 # First, copy just the preset files and install dependencies
 # This is done before copying the code to utilize Docker's layer caching and
diff --git a/presets/inference/llama2-chat/inference_api.py b/presets/inference/llama2-chat/inference_api.py
index 11776bf3d..a91786e53 100644
--- a/presets/inference/llama2-chat/inference_api.py
+++ b/presets/inference/llama2-chat/inference_api.py
@@ -8,6 +8,7 @@
 import signal
 import sys
 import threading
+import json
 from typing import Optional
 
 import GPUtil
@@ -18,6 +19,9 @@
 from llama import Llama
 from pydantic import BaseModel
 
+# Constants
+MODEL_INFO = "model_info.json"
+
 # Setup argparse
 parser = argparse.ArgumentParser(description="Llama API server.")
 parser.add_argument("--ckpt_dir", default="weights/", help="Checkpoint directory.")
@@ -191,6 +195,13 @@ def get_metrics():
         except Exception as e:
             return {"error": str(e)}
 
+    @app_main.get("/version")
+    def get_version():
+        with open(f"/workspace/llama/{MODEL_INFO}", "r") as f:
+            model_info = json.load(f)
+
+        return model_info
+
     def setup_worker_routes():
         @app_worker.get("/healthz")
         def health_check():
diff --git a/presets/inference/llama2-completion/inference_api.py b/presets/inference/llama2-completion/inference_api.py
index cf500146a..f29ba91e1 100644
--- a/presets/inference/llama2-completion/inference_api.py
+++ b/presets/inference/llama2-completion/inference_api.py
@@ -8,6 +8,7 @@
 import signal
 import sys
 import threading
+import json
 from typing import Optional
 
 import GPUtil
@@ -18,6 +19,9 @@
 from llama import Llama
 from pydantic import BaseModel
 
+# Constants
+MODEL_INFO = "model_info.json"
+
 # Setup argparse
 parser = argparse.ArgumentParser(description="Llama API server.")
 parser.add_argument("--ckpt_dir", default="weights/", help="Checkpoint directory.")
@@ -180,6 +184,13 @@ def get_metrics():
         except Exception as e:
             return {"error": str(e)}
 
+    @app_main.get("/version")
+    def get_version():
+        with open(f"/workspace/llama/{MODEL_INFO}", "r") as f:
+            model_info = json.load(f)
+
+        return model_info
+
     def setup_worker_routes():
         @app_worker.get("/healthz")
         def health_check():
diff --git a/presets/inference/text-generation/api_spec.json b/presets/inference/text-generation/api_spec.json
index 480fa97e4..8cdb9c16d 100644
--- a/presets/inference/text-generation/api_spec.json
+++ b/presets/inference/text-generation/api_spec.json
@@ -1,599 +1,658 @@
 {
   "openapi": "3.1.0",
   "info": {
-        "title": "FastAPI",
-        "version": "0.1.0"
+    "title": "FastAPI",
+    "version": "0.1.0"
   },
   "paths": {
-        "/": {
-            "get": {
-                "summary": "Home Endpoint",
-                "description": "A simple endpoint that indicates the server is running.\nNo parameters are required. 
Returns a message indicating the server status.", + "operationId": "home__get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HomeResponse" + } } + } } - }, - "/healthz": { - "get": { - "summary": "Health Check Endpoint", - "operationId": "health_check_healthz_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HealthStatus" - }, - "example": { - "status": "Healthy" - } - } - } - }, - "500": { - "description": "Error Response", - "content": { - "application/json": { - "examples": { - "model_uninitialized": { - "summary": "Model not initialized", - "value": { - "detail": "Model not initialized" - } - }, - "pipeline_uninitialized": { - "summary": "Pipeline not initialized", - "value": { - "detail": "Pipeline not initialized" - } - } - } - } - } + } + } + }, + "/healthz": { + "get": { + "summary": "Health Check Endpoint", + "operationId": "health_check_healthz_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HealthStatus" + }, + "example": { + "status": "Healthy" + } + } + } + }, + "500": { + "description": "Error Response", + "content": { + "application/json": { + "examples": { + "model_uninitialized": { + "summary": "Model not initialized", + "value": { + "detail": "Model not initialized" + } + }, + "pipeline_uninitialized": { + "summary": "Pipeline not initialized", + "value": { + "detail": "Pipeline not initialized" + } } + } } + } } - }, - "/chat": { - "post": { - "summary": "Chat Endpoint", - "description": "Processes chat requests, generating text based on the specified pipeline (text generation or conversational).\nValidates required parameters based on the pipeline and returns the generated text.", - "operationId": "generate_text_chat_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/UnifiedRequestModel" - }, - "examples": { - "text_generation_example": { - "summary": "Text Generation Example", - "description": "An example of a text generation request.", - "value": { - "prompt": "Tell me a joke", - "return_full_text": true, - "clean_up_tokenization_spaces": false, - "generate_kwargs": { - "max_length": 200, - "min_length": 0, - "do_sample": true, - "early_stopping": false, - "num_beams": 1, - "temperature": 1, - "top_k": 10, - "top_p": 1, - "typical_p": 1, - "repetition_penalty": 1, - "eos_token_id": 11 - } - } - }, - "conversation_example": { - "summary": "Conversation Example", - "description": "An example of a conversational request.", - "value": { - "messages": [ - { - "role": "user", - "content": "What is your favourite condiment?" - }, - { - "role": "assistant", - "content": "Well, im quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever im cooking up in the kitchen!" - }, - { - "role": "user", - "content": "Do you have mayonnaise recipes?" 
- } - ], - "return_full_text": true, - "clean_up_tokenization_spaces": false, - "generate_kwargs": { - "max_length": 200, - "min_length": 0, - "do_sample": true, - "early_stopping": false, - "num_beams": 1, - "temperature": 1, - "top_k": 10, - "top_p": 1, - "typical_p": 1, - "repetition_penalty": 1, - "eos_token_id": 11 - } - } - } - } - } - }, - "required": true + } + } + }, + "/chat": { + "post": { + "summary": "Chat Endpoint", + "description": "Processes chat requests, generating text based on the specified pipeline (text generation or conversational).\nValidates required parameters based on the pipeline and returns the generated text.", + "operationId": "generate_text_chat_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UnifiedRequestModel" }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {}, - "examples": { - "text_generation": { - "summary": "Text Generation Response", - "value": { - "Result": "Generated text based on the prompt." - } - }, - "conversation": { - "summary": "Conversation Response", - "value": { - "Result": "Response to the last message in the conversation." - } - } - } - } - } - }, - "400": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "examples": { - "missing_prompt": { - "summary": "Missing Prompt", - "value": { - "detail": "Text generation parameter prompt required" - } - }, - "missing_messages": { - "summary": "Missing Messages", - "value": { - "detail": "Conversational parameter messages required" - } - } - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } + "examples": { + "text_generation_example": { + "summary": "Text Generation Example", + "description": "An example of a text generation request.", + "value": { + "prompt": "Tell me a joke", + "return_full_text": true, + "clean_up_tokenization_spaces": false, + "generate_kwargs": { + "max_length": 200, + "min_length": 0, + "do_sample": true, + "early_stopping": false, + "num_beams": 1, + "temperature": 1, + "top_k": 10, + "top_p": 1, + "typical_p": 1, + "repetition_penalty": 1, + "eos_token_id": 11 + } + } + }, + "conversation_example": { + "summary": "Conversation Example", + "description": "An example of a conversational request.", + "value": { + "messages": [ + { + "role": "user", + "content": "What is your favourite condiment?" + }, + { + "role": "assistant", + "content": "Well, im quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever im cooking up in the kitchen!" + }, + { + "role": "user", + "content": "Do you have mayonnaise recipes?" 
} + ], + "return_full_text": true, + "clean_up_tokenization_spaces": false, + "generate_kwargs": { + "max_length": 200, + "min_length": 0, + "do_sample": true, + "early_stopping": false, + "num_beams": 1, + "temperature": 1, + "top_k": 10, + "top_p": 1, + "typical_p": 1, + "repetition_penalty": 1, + "eos_token_id": 11 + } + } + } + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {}, + "examples": { + "text_generation": { + "summary": "Text Generation Response", + "value": { + "Result": "Generated text based on the prompt." + } + }, + "conversation": { + "summary": "Conversation Response", + "value": { + "Result": "Response to the last message in the conversation." + } + } + } + } + } + }, + "400": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "examples": { + "missing_prompt": { + "summary": "Missing Prompt", + "value": { + "detail": "Text generation parameter prompt required" + } + }, + "missing_messages": { + "summary": "Missing Messages", + "value": { + "detail": "Conversational parameter messages required" + } } + } } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } } - }, - "/metrics": { - "get": { - "summary": "Metrics Endpoint", - "description": "Provides system metrics, including GPU details if available, or CPU and memory usage otherwise.\nUseful for monitoring the resource utilization of the server running the ML models.", - "operationId": "get_metrics_metrics_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/MetricsResponse" - }, - "examples": { - "gpu_metrics": { - "summary": "Example when GPUs are available", - "value": { - "gpu_info": [ - { - "id": "GPU-1234", - "name": "GeForce GTX 950", - "load": "25.00%", - "temperature": "55 C", - "memory": { - "used": "1.00 GB", - "total": "2.00 GB" - } - } - ] - } - }, - "cpu_metrics": { - "summary": "Example when only CPU is available", - "value": { - "cpu_info": { - "load_percentage": 20, - "physical_cores": 4, - "total_cores": 8, - "memory": { - "used": "4.00 GB", - "total": "16.00 GB" - } - } - } - } - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } + } + } + }, + "/metrics": { + "get": { + "summary": "Metrics Endpoint", + "description": "Provides system metrics, including GPU details if available, or CPU and memory usage otherwise.\nUseful for monitoring the resource utilization of the server running the ML models.", + "operationId": "get_metrics_metrics_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MetricsResponse" + }, + "examples": { + "gpu_metrics": { + "summary": "Example when GPUs are available", + "value": { + "gpu_info": [ + { + "id": "GPU-1234", + "name": "GeForce GTX 950", + "load": "25.00%", + "temperature": "55 C", + "memory": { + "used": "1.00 GB", + "total": "2.00 GB" } + } + ] + } + }, 
+ "cpu_metrics": { + "summary": "Example when only CPU is available", + "value": { + "cpu_info": { + "load_percentage": 20, + "physical_cores": 4, + "total_cores": 8, + "memory": { + "used": "4.00 GB", + "total": "16.00 GB" + } } + } } + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } } + } } + } } - }, - "components": { - "schemas": { - "CPUInfo": { - "properties": { - "load_percentage": { - "type": "number", - "title": "Load Percentage" - }, - "physical_cores": { - "type": "integer", - "title": "Physical Cores" - }, - "total_cores": { - "type": "integer", - "title": "Total Cores" - }, - "memory": { - "$ref": "#/components/schemas/MemoryInfo" + }, + "/version": { + "get": { + "summary": "Get Model Information", + "description": "Reads and returns model version information from a predefined JSON file.", + "operationId": "get_version_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {}, + "examples": { + "model_info": { + "summary": "Model Information Response", + "value": { + "Model Type": "Your Model Type", + "Version": "1.0.0", + "Image Name": "model_image_name", + "Model Version URL": "http://example.com/model/version", + "REVISION_ID": "revision_hash" + } } - }, - "type": "object", - "required": [ - "load_percentage", - "physical_cores", - "total_cores", - "memory" - ], - "title": "CPUInfo" - }, - "ErrorResponse": { - "properties": { - "detail": { - "type": "string", - "title": "Detail" - } - }, - "type": "object", - "required": [ - "detail" - ], - "title": "ErrorResponse" - }, - "GPUInfo": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "name": { - "type": "string", - "title": "Name" - }, - "load": { - "type": "string", - "title": "Load" - }, - "temperature": { - "type": "string", - "title": "Temperature" - }, - "memory": { - "$ref": "#/components/schemas/MemoryInfo" - } - }, - "type": "object", - "required": [ - "id", - "name", - "load", - "temperature", - "memory" - ], - "title": "GPUInfo" - }, - "GenerateKwargs": { - "properties": { - "max_length": { - "type": "integer", - "title": "Max Length", - "default": 200 - }, - "min_length": { - "type": "integer", - "title": "Min Length", - "default": 0 - }, - "do_sample": { - "type": "boolean", - "title": "Do Sample", - "default": true - }, - "early_stopping": { - "type": "boolean", - "title": "Early Stopping", - "default": false - }, - "num_beams": { - "type": "integer", - "title": "Num Beams", - "default": 1 - }, - "temperature": { - "type": "number", - "title": "Temperature", - "default": 1 - }, - "top_k": { - "type": "integer", - "title": "Top K", - "default": 10 - }, - "top_p": { - "type": "number", - "title": "Top P", - "default": 1 - }, - "typical_p": { - "type": "number", - "title": "Typical P", - "default": 1 - }, - "repetition_penalty": { - "type": "number", - "title": "Repetition Penalty", - "default": 1 - }, - "pad_token_id": { - "type": "integer", - "title": "Pad Token Id" - }, - "eos_token_id": { - "type": "integer", - "title": "Eos Token Id", - "default": 11 - } - }, - "type": "object", - "title": "GenerateKwargs", - "example": { - "max_length": 200, - "temperature": 0.7, - "top_p": 0.9, - "additional_param": "Example value" + } } + } }, - "HTTPValidationError": { - "properties": { - "detail": { - "items": { - "$ref": "#/components/schemas/ValidationError" - }, - "type": "array", - "title": 
"Detail" - } - }, - "type": "object", - "title": "HTTPValidationError" - }, - "HealthStatus": { - "properties": { - "status": { - "type": "string", - "title": "Status", - "example": "Healthy" - } - }, - "type": "object", - "required": [ - "status" - ], - "title": "HealthStatus" - }, - "HomeResponse": { - "properties": { - "message": { - "type": "string", - "title": "Message", - "example": "Server is running" - } - }, - "type": "object", - "required": [ - "message" - ], - "title": "HomeResponse" - }, - "MemoryInfo": { - "properties": { - "used": { - "type": "string", - "title": "Used" - }, - "total": { - "type": "string", - "title": "Total" + "404": { + "description": "Model Info Not Found", + "content": { + "application/json": { + "examples": { + "file_not_found": { + "summary": "Model Info File Not Found", + "value": { + "detail": "/workspace/tfs/model_info.json file not found." + } } - }, - "type": "object", - "required": [ - "used", - "total" - ], - "title": "MemoryInfo" - }, - "Message": { - "properties": { - "role": { - "type": "string", - "title": "Role" - }, - "content": { - "type": "string", - "title": "Content" - } - }, - "type": "object", - "required": [ - "role", - "content" - ], - "title": "Message" - }, - "MetricsResponse": { - "properties": { - "gpu_info": { - "items": { - "$ref": "#/components/schemas/GPUInfo" - }, - "type": "array", - "title": "Gpu Info" - }, - "cpu_info": { - "$ref": "#/components/schemas/CPUInfo" - } - }, - "type": "object", - "title": "MetricsResponse" - }, - "UnifiedRequestModel": { - "properties": { - "prompt": { - "type": "string", - "title": "Prompt", - "description": "Prompt for text generation. Required for text-generation pipeline. Do not use with 'messages'." - }, - "return_full_text": { - "type": "boolean", - "title": "Return Full Text", - "description": "Return full text if True, else only added text", - "default": true - }, - "clean_up_tokenization_spaces": { - "type": "boolean", - "title": "Clean Up Tokenization Spaces", - "description": "Clean up extra spaces in text output", - "default": false - }, - "prefix": { - "type": "string", - "title": "Prefix", - "description": "Prefix added to prompt" - }, - "handle_long_generation": { - "type": "string", - "title": "Handle Long Generation", - "description": "Strategy to handle long generation" - }, - "generate_kwargs": { - "allOf": [ - { - "$ref": "#/components/schemas/GenerateKwargs" - } - ], - "title": "Generate Kwargs", - "description": "Additional kwargs for generate method" - }, - "messages": { - "items": { - "$ref": "#/components/schemas/Message" - }, - "type": "array", - "title": "Messages", - "description": "Messages for conversational model. Required for conversational pipeline. Do not use with 'prompt'." - } - }, - "type": "object", - "title": "UnifiedRequestModel" - }, - "ValidationError": { - "properties": { - "loc": { - "items": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - } - ] - }, - "type": "array", - "title": "Location" - }, - "msg": { - "type": "string", - "title": "Message" - }, - "type": { - "type": "string", - "title": "Error Type" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "examples": { + "unexpected_error": { + "summary": "Unexpected Error", + "value": { + "detail": "An unexpected error occurred on the server." 
+ } } - }, - "type": "object", - "required": [ - "loc", - "msg", - "type" - ], - "title": "ValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "CPUInfo": { + "properties": { + "load_percentage": { + "type": "number", + "title": "Load Percentage" + }, + "physical_cores": { + "type": "integer", + "title": "Physical Cores" + }, + "total_cores": { + "type": "integer", + "title": "Total Cores" + }, + "memory": { + "$ref": "#/components/schemas/MemoryInfo" + } + }, + "type": "object", + "required": [ + "load_percentage", + "physical_cores", + "total_cores", + "memory" + ], + "title": "CPUInfo" + }, + "ErrorResponse": { + "properties": { + "detail": { + "type": "string", + "title": "Detail" + } + }, + "type": "object", + "required": [ + "detail" + ], + "title": "ErrorResponse" + }, + "GPUInfo": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "name": { + "type": "string", + "title": "Name" + }, + "load": { + "type": "string", + "title": "Load" + }, + "temperature": { + "type": "string", + "title": "Temperature" + }, + "memory": { + "$ref": "#/components/schemas/MemoryInfo" + } + }, + "type": "object", + "required": [ + "id", + "name", + "load", + "temperature", + "memory" + ], + "title": "GPUInfo" + }, + "GenerateKwargs": { + "properties": { + "max_length": { + "type": "integer", + "title": "Max Length", + "default": 200 + }, + "min_length": { + "type": "integer", + "title": "Min Length", + "default": 0 + }, + "do_sample": { + "type": "boolean", + "title": "Do Sample", + "default": true + }, + "early_stopping": { + "type": "boolean", + "title": "Early Stopping", + "default": false + }, + "num_beams": { + "type": "integer", + "title": "Num Beams", + "default": 1 + }, + "temperature": { + "type": "number", + "title": "Temperature", + "default": 1 + }, + "top_k": { + "type": "integer", + "title": "Top K", + "default": 10 + }, + "top_p": { + "type": "number", + "title": "Top P", + "default": 1 + }, + "typical_p": { + "type": "number", + "title": "Typical P", + "default": 1 + }, + "repetition_penalty": { + "type": "number", + "title": "Repetition Penalty", + "default": 1 + }, + "pad_token_id": { + "type": "integer", + "title": "Pad Token Id" + }, + "eos_token_id": { + "type": "integer", + "title": "Eos Token Id", + "default": 11 + } + }, + "type": "object", + "title": "GenerateKwargs", + "example": { + "max_length": 200, + "temperature": 0.7, + "top_p": 0.9, + "additional_param": "Example value" + } + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, + "HealthStatus": { + "properties": { + "status": { + "type": "string", + "title": "Status", + "example": "Healthy" + } + }, + "type": "object", + "required": [ + "status" + ], + "title": "HealthStatus" + }, + "HomeResponse": { + "properties": { + "message": { + "type": "string", + "title": "Message", + "example": "Server is running" + } + }, + "type": "object", + "required": [ + "message" + ], + "title": "HomeResponse" + }, + "MemoryInfo": { + "properties": { + "used": { + "type": "string", + "title": "Used" + }, + "total": { + "type": "string", + "title": "Total" + } + }, + "type": "object", + "required": [ + "used", + "total" + ], + "title": "MemoryInfo" + }, + "Message": { + "properties": { + "role": { + "type": "string", + "title": "Role" + }, + "content": { + "type": "string", + "title": "Content" + } + }, + 
"type": "object", + "required": [ + "role", + "content" + ], + "title": "Message" + }, + "MetricsResponse": { + "properties": { + "gpu_info": { + "items": { + "$ref": "#/components/schemas/GPUInfo" + }, + "type": "array", + "title": "Gpu Info" + }, + "cpu_info": { + "$ref": "#/components/schemas/CPUInfo" + } + }, + "type": "object", + "title": "MetricsResponse" + }, + "UnifiedRequestModel": { + "properties": { + "prompt": { + "type": "string", + "title": "Prompt", + "description": "Prompt for text generation. Required for text-generation pipeline. Do not use with 'messages'." + }, + "return_full_text": { + "type": "boolean", + "title": "Return Full Text", + "description": "Return full text if True, else only added text", + "default": true + }, + "clean_up_tokenization_spaces": { + "type": "boolean", + "title": "Clean Up Tokenization Spaces", + "description": "Clean up extra spaces in text output", + "default": false + }, + "prefix": { + "type": "string", + "title": "Prefix", + "description": "Prefix added to prompt" + }, + "handle_long_generation": { + "type": "string", + "title": "Handle Long Generation", + "description": "Strategy to handle long generation" + }, + "generate_kwargs": { + "allOf": [ + { + "$ref": "#/components/schemas/GenerateKwargs" + } + ], + "title": "Generate Kwargs", + "description": "Additional kwargs for generate method" + }, + "messages": { + "items": { + "$ref": "#/components/schemas/Message" + }, + "type": "array", + "title": "Messages", + "description": "Messages for conversational model. Required for conversational pipeline. Do not use with 'prompt'." + } + }, + "type": "object", + "title": "UnifiedRequestModel" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "type": "array", + "title": "Location" + }, + "msg": { + "type": "string", + "title": "Message" + }, + "type": { + "type": "string", + "title": "Error Type" } + }, + "type": "object", + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError" } + } } -} \ No newline at end of file + } \ No newline at end of file diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index c23a15c6b..23dccba5d 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+import json
 import os
 from dataclasses import asdict, dataclass, field
 from typing import Annotated, Any, Dict, List, Optional
@@ -15,6 +16,10 @@
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           GenerationConfig, HfArgumentParser)
 
+# Constants
+APP_DIR = "/workspace/tfs"
+WEIGHTS_DIR = f"{APP_DIR}/weights"
+MODEL_INFO_FILE = f"{APP_DIR}/model_info.json"
 
 @dataclass
 class ModelConfig:
@@ -22,7 +27,7 @@ class ModelConfig:
     Transformers Model Configuration Parameters
     """
     pipeline: str = field(metadata={"help": "The model pipeline for the pre-trained model"})
-    pretrained_model_name_or_path: Optional[str] = field(default="/workspace/tfs/weights", metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"})
+    pretrained_model_name_or_path: Optional[str] = field(default=WEIGHTS_DIR, metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"})
     state_dict: Optional[Dict[str, Any]] = field(default=None, metadata={"help": "State dictionary for the model"})
     cache_dir: Optional[str] = field(default=None, metadata={"help": "Cache directory for the model"})
     from_tf: bool = field(default=False, metadata={"help": "Load model from a TensorFlow checkpoint"})
@@ -428,6 +433,74 @@
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
+@app.get(
+    "/version",
+    summary="Get Model Information",
+    response_description="Model Version Information",
+    responses={
+        200: {
+            "description": "Successful Response",
+            "content": {
+                "application/json": {
+                    "examples": {
+                        "model_info": {
+                            "summary": "Model Information Response",
+                            "value": {
+                                "Model Type": "Your Model Type",
+                                "Version": "1.0.0",
+                                "Image Name": "model_image_name",
+                                "Model Version URL": "http://example.com/model/version",
+                                "REVISION_ID": "revision_hash"
+                            }
+                        }
+                    }
+                }
+            }
+        },
+        404: {
+            "description": "Model Info Not Found",
+            "content": {
+                "application/json": {
+                    "examples": {
+                        "file_not_found": {
+                            "summary": "Model Info File Not Found",
+                            "value": {"detail": f"{MODEL_INFO_FILE} file not found."}
+                        }
+                    }
+                }
+            }
+        },
+        500: {
+            "description": "Internal Server Error",
+            "content": {
+                "application/json": {
+                    "examples": {
+                        "unexpected_error": {
+                            "summary": "Unexpected Error",
+                            "value": {
+                                "detail": "An unexpected error occurred on the server."
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+)
+def get_version():
+    """
+    Reads and returns model version information from a predefined JSON file.
+    """
+    try:
+        with open(MODEL_INFO_FILE, "r") as f:
+            model_info = json.load(f)
+    except FileNotFoundError:
+        raise HTTPException(status_code=404, detail=f"{MODEL_INFO_FILE} file not found.")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+    return model_info
+
 if __name__ == "__main__":
     local_rank = int(os.environ.get("LOCAL_RANK", 0)) # Default to 0 if not set
     port = 5000 + local_rank # Adjust port based on local rank
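With this route in place, every preset server reports its provenance over HTTP. A minimal client-side check, sketched with `requests` (host and port below are placeholders; the e2e workflow instead hits the service IP on port 80, while a local single-rank server listens on 5000):

```python
import requests

# Placeholder address; substitute the workspace service IP when deployed.
BASE_URL = "http://localhost:5000"

resp = requests.get(f"{BASE_URL}/version", timeout=10)
resp.raise_for_status()
info = resp.json()

# Keys written by the Dockerfiles' model_info.json step.
print(info.get("Image Name"), info.get("Version"), info.get("REVISION_ID"))
```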
+ """ + try: + with open(MODEL_INFO_FILE, "r") as f: + model_info = json.load(f) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="model_info.json file not found.") + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + return model_info + if __name__ == "__main__": local_rank = int(os.environ.get("LOCAL_RANK", 0)) # Default to 0 if not set port = 5000 + local_rank # Adjust port based on local rank diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index 0441a945a..ae57ff75d 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,28 +3,29 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-13b type: llama2-completion runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-70b type: llama2-completion runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.3 + tag: 0.0.4 # Tag history: + # 0.0.4 - Version endpoint (#297) # 0.0.3 - Inference API Cleanup (#233) # 0.0.2 - Eliminate Unnecessary Process Group Creation in Worker Initialization (#244) # 0.0.1 - Initial Release @@ -34,28 +35,31 @@ models: type: text-generation version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - tag: 0.0.4 + tag: 0.0.5 - name: falcon-7b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - tag: 0.0.4 + tag: 0.0.5 # Tag history: + # 0.0.5 - Version endpoint (#297) # 0.0.4 - Adjust default model params (#310) # 0.0.3 - Update Default Params (#294) # 0.0.2 - Inference API Cleanup (#233) # 0.0.1 - Initial Release + - name: falcon-40b type: text-generation version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - tag: 0.0.5 + tag: 0.0.6 - name: falcon-40b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - tag: 0.0.5 + tag: 0.0.6 # Tag history for 40b models: + # 0.0.6 - Version endpoint (#297) # 0.0.5 - Adjust default model params (#310) # 0.0.4 - Skipped due to incomplete upload issue # 0.0.3 - Update Default Params (#294) @@ -67,13 +71,14 @@ models: type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 runtime: tfs - tag: 0.0.4 + tag: 0.0.5 - name: mistral-7b-instruct type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs - tag: 0.0.4 + tag: 0.0.5 # Tag history: + # 0.0.5 - Version endpoint (#297) # 0.0.4 - Adjust default model params (#310) # 0.0.3 - Update Default Params (#294) # 0.0.2 - Inference API Cleanup (#233) @@ -84,8 +89,9 @@ models: type: text-generation version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 runtime: tfs - tag: 0.0.3 + tag: 0.0.4 # Tag history: + # 0.0.4 - Version endpoint (#297) # 0.0.3 - Adjust default model params (#310) # 0.0.2 - Update Default Params (#294) # 0.0.1 - Initial Release