From f5a0862f9823c74ea7662686c5c11b452f68ce0b Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Mon, 5 Feb 2024 21:23:49 -0800
Subject: [PATCH 01/24] feat: clean up inference, add validation checks
---
 .../text-generation/inference-api.py     | 135 ++++++++++--------
 .../text-generation/requirements.txt     |   1 +
 2 files changed, 77 insertions(+), 59 deletions(-)
diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py
index 7aa8c8ea4..de204401c 100644
--- a/presets/inference/text-generation/inference-api.py
+++ b/presets/inference/text-generation/inference-api.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
-import argparse
 import os
+from dataclasses import asdict, dataclass, field
 from typing import Any, Dict, List, Optional
 import GPUtil
@@ -9,62 +9,54 @@
 import transformers
 import uvicorn
 from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel, Field
-from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
-
+from pydantic import BaseModel, Extra, Field
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          GenerationConfig, HfArgumentParser)
+
+
+@dataclass
+class ModelConfig:
+    """
+    HuggingFace Model Configuration Parameters
+    """
+    pipeline: str = field(metadata={"help": "The model pipeline for the pre-trained model"})
+    pretrained_model_name_or_path: Optional[str] = field(default="/workspace/tfs/weights", metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"})
+    state_dict: Optional[Dict[str, Any]] = field(default=None, metadata={"help": "State dictionary for the model"})
+    cache_dir: Optional[str] = field(default=None, metadata={"help": "Cache directory for the model"})
+    from_tf: bool = field(default=False, metadata={"help": "Load model from a TensorFlow checkpoint"})
+    force_download: bool = field(default=False, metadata={"help": "Force the download of the model"})
+    resume_download: bool = field(default=False, metadata={"help": "Resume an interrupted download"})
+    proxies: Optional[str] = field(default=None, metadata={"help": "Proxy configuration for downloading the model"})
+    output_loading_info: bool = field(default=False, metadata={"help": "Output additional loading information"})
+    use_remote_files: bool = field(default=False, metadata={"help": "Allow using remote files, default is local only"})
+    revision: str = field(default="main", metadata={"help": "Specific model version to use"})
+    trust_remote_code: bool = field(default=False, metadata={"help": "Enable trusting remote code when loading the model"})
+    load_in_4bit: bool = field(default=False, metadata={"help": "Load model in 4-bit mode"})
+    load_in_8bit: bool = field(default=False, metadata={"help": "Load model in 8-bit mode"})
+    torch_dtype: Optional[str] = field(default=None, metadata={"help": "The torch dtype for the pre-trained model"})
+    device_map: str = field(default="auto", metadata={"help": "The device map for the pre-trained model"})
+
+    def __post_init__(self):
+        if self.torch_dtype and not hasattr(torch, self.torch_dtype):
+            raise ValueError(f"Invalid torch dtype: {self.torch_dtype}")
+        self.torch_dtype = getattr(torch, self.torch_dtype) if self.torch_dtype else None
+
+        supported_pipelines = {"conversational", "text-generation"}
+        if self.pipeline not in supported_pipelines:
+            raise ValueError(f"Unsupported pipeline: {self.pipeline}")
+
+parser = HfArgumentParser(ModelConfig)
+args, unknown_args = parser.parse_args_into_dataclasses(
+    return_remaining_strings=True
+)
-def dtype_type(string):
-    if hasattr(torch, string):
-        return getattr(torch, string)
-    else:
-        raise ValueError(f"Invalid torch dtype: {string}")
-
-parser = argparse.ArgumentParser(description='Model Configuration')
-parser.add_argument('--pipeline', required=True, type=str, help='The model pipeline for the pre-trained model')
-parser.add_argument('--load_in_8bit', default=False, action='store_true', help='Load model in 8-bit mode')
-parser.add_argument('--trust_remote_code', default=False, action='store_true', help='Enable trusting remote code when loading the model')
-parser.add_argument('--torch_dtype', default=None, type=dtype_type, help='The torch dtype for the pre-trained model')
-parser.add_argument('--device_map', default="auto", type=str, help='The device map for the pre-trained model')
-parser.add_argument('--cache_dir', type=str, default=None, help='Cache directory for the model')
-parser.add_argument('--from_tf', action='store_true', default=False, help='Load model from a TensorFlow checkpoint')
-parser.add_argument('--force_download', action='store_true', default=False, help='Force the download of the model')
-parser.add_argument('--resume_download', action='store_true', default=False, help='Resume an interrupted download')
-parser.add_argument('--proxies', type=str, default=None, help='Proxy configuration for downloading the model')
-parser.add_argument('--revision', type=str, default="main", help='Specific model version to use')
-# parser.add_argument('--local_files_only', action='store_true', default=False, help='Only use local files for model loading')
-parser.add_argument('--output_loading_info', action='store_true', default=False, help='Output additional loading information')
-
-args = parser.parse_args()
+model_args = asdict(args)
+model_args["local_files_only"] = not model_args.pop('use_remote_files')
+model_pipeline = model_args.pop('pipeline')
 app = FastAPI()
-
-supported_pipelines = {"conversational", "text-generation"}
-if args.pipeline not in supported_pipelines:
-    raise HTTPException(status_code=400, detail="Invalid pipeline specified")
-
-model_kwargs = {
-    "cache_dir": args.cache_dir,
-    "from_tf": args.from_tf,
-    "force_download": args.force_download,
-    "resume_download": args.resume_download,
-    "proxies": args.proxies,
-    "revision": args.revision,
-    "output_loading_info": args.output_loading_info,
-    "trust_remote_code": args.trust_remote_code,
-    "device_map": args.device_map,
-    "local_files_only": True,
-}
-
-if args.load_in_8bit:
-    model_kwargs["load_in_8bit"] = args.load_in_8bit
-if args.torch_dtype:
-    model_kwargs["torch_dtype"] = args.torch_dtype
-
-tokenizer = AutoTokenizer.from_pretrained("/workspace/tfs/weights", **model_kwargs)
-model = AutoModelForCausalLM.from_pretrained(
-    "/workspace/tfs/weights",
-    **model_kwargs
-)
+tokenizer = AutoTokenizer.from_pretrained(**model_args)
+model = AutoModelForCausalLM.from_pretrained(**model_args)
 pipeline_kwargs = {
     "trust_remote_code": args.trust_remote_code,
@@ -75,7 +67,7 @@ def dtype_type(string):
     pipeline_kwargs["torch_dtype"] = args.torch_dtype
 pipeline = transformers.pipeline(
-    args.pipeline,
+    model_pipeline,
     model=model,
     tokenizer=tokenizer,
     **pipeline_kwargs
@@ -101,17 +93,42 @@ def health_check():
         raise HTTPException(status_code=500, detail="Pipeline not initialized")
     return {"status": "Healthy"}
+class GenerateKwargs(BaseModel):
+    max_length: int = 200
+    min_length: int = 0
+    do_sample: bool = True
+    early_stopping: bool = False
+ num_beams: int = 1 + num_beam_groups: int = 1 + diversity_penalty: float = 0.0 + temperature: float = 1.0 + top_k: int = 10 + top_p: float = 1 + typical_p: float = 1 + repetition_penalty: float = 1 + length_penalty: float = 1 + no_repeat_ngram_size: int = 0 + encoder_no_repeat_ngram_size: int = 0 + bad_words_ids: Optional[List[int]] = None + num_return_sequences: int = 1 + output_scores: bool = False + return_dict_in_generate: bool = False + pad_token_id: Optional[int] = tokenizer.pad_token_id + eos_token_id: Optional[int] = tokenizer.eos_token_id + forced_bos_token_id: Optional[int] = None + forced_eos_token_id: Optional[int] = None + remove_invalid_values: Optional[bool] = None + class Config: + extra = Extra.allow # Allows for additional fields not explicitly defined + class UnifiedRequestModel(BaseModel): # Fields for text generation prompt: Optional[str] = Field(None, description="Prompt for text generation") - # Mutually Exclusive with return_full_text - # return_tensors: Optional[bool] = Field(False, description="Return tensors of predictions") - # return_text: Optional[bool] = Field(True, description="Return decoded texts in the outputs") return_full_text: Optional[bool] = Field(True, description="Return full text if True, else only added text") clean_up_tokenization_spaces: Optional[bool] = Field(False, description="Clean up extra spaces in text output") prefix: Optional[str] = Field(None, description="Prefix added to prompt") handle_long_generation: Optional[str] = Field(None, description="Strategy to handle long generation") - generate_kwargs: Optional[Dict[str, Any]] = Field(None, description="Additional kwargs for generate method") + generate_kwargs: Optional[GenerateKwargs] = Field(None, description="Additional kwargs for generate method") # Field for conversational model messages: Optional[List[dict]] = Field(None, description="Messages for conversational model") diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 2e8a5e33e..424159dba 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -6,5 +6,6 @@ fastapi==0.103.2 pydantic==1.10.9 uvicorn[standard]==0.23.2 bitsandbytes==0.41.1 +scipy==1.10.1 deepspeed==0.11.1 gputil==1.4.0 \ No newline at end of file From 8e7f6c3233490c18b17b1ba1375c9e9ee46bf848 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 6 Feb 2024 10:56:22 -0800 Subject: [PATCH 02/24] feat: unknown arg parsing added --- .../text-generation/inference-api.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index de204401c..145a8db12 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -17,7 +17,7 @@ @dataclass class ModelConfig: """ - HuggingFace Model Configuration Parameters + Transformers Model Configuration Parameters """ pipeline: str = field(metadata={"help": "The model pipeline for the pre-trained model"}) pretrained_model_name_or_path: Optional[str] = field(default="/workspace/tfs/weights", metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"}) @@ -36,7 +36,31 @@ class ModelConfig: torch_dtype: Optional[str] = field(default=None, metadata={"help": "The torch dtype for the pre-trained model"}) device_map: str = field(default="auto", metadata={"help": "The device 
map for the pre-trained model"}) + # Method to process unknown arguments + def process_unknown_args(self, unknown_args: List[str]): + """ + Process unknown cmd line args and update the model configuration accordingly. + """ + unknown_args_dict = {} + i = 0 + while i < len(unknown_args): + key = unknown_args[i].lstrip('-') # Remove leading dashes + if i + 1 < len(unknown_args) and not unknown_args[i + 1].startswith('--'): + value = unknown_args[i + 1] + i += 2 # Move past the current key-value pair + else: + value = True # Assign a True value for standalone flags + i += 1 # Move to the next item + + unknown_args_dict[key] = value + + # Update the ModelConfig instance with the unknown args + self.__dict__.update(unknown_args_dict) + def __post_init__(self): + """ + Post-initialization to validate some ModelConfig values + """ if self.torch_dtype and not hasattr(torch, self.torch_dtype): raise ValueError(f"Invalid torch dtype: {self.torch_dtype}") self.torch_dtype = getattr(torch, self.torch_dtype) if self.torch_dtype else None @@ -50,6 +74,8 @@ def __post_init__(self): return_remaining_strings=True ) +args.process_unknown_args(unknown_args) + model_args = asdict(args) model_args["local_files_only"] = not model_args.pop('use_remote_files') model_pipeline = model_args.pop('pipeline') From 8cc3f2ea42179b73620638dc30f4df241032475d Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 6 Feb 2024 11:17:52 -0800 Subject: [PATCH 03/24] feat: rename func --- .../text-generation/inference-api.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index 145a8db12..c41cffa8e 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -36,26 +36,26 @@ class ModelConfig: torch_dtype: Optional[str] = field(default=None, metadata={"help": "The torch dtype for the pre-trained model"}) device_map: str = field(default="auto", metadata={"help": "The device map for the pre-trained model"}) - # Method to process unknown arguments - def process_unknown_args(self, unknown_args: List[str]): + # Method to process additional arguments + def process_additional_args(self, addt_args: List[str]): """ - Process unknown cmd line args and update the model configuration accordingly. + Process additional cmd line args and update the model configuration accordingly. 
""" - unknown_args_dict = {} + addt_args_dict = {} i = 0 - while i < len(unknown_args): - key = unknown_args[i].lstrip('-') # Remove leading dashes - if i + 1 < len(unknown_args) and not unknown_args[i + 1].startswith('--'): - value = unknown_args[i + 1] + while i < len(addt_args): + key = addt_args[i].lstrip('-') # Remove leading dashes + if i + 1 < len(addt_args) and not addt_args[i + 1].startswith('--'): + value = addt_args[i + 1] i += 2 # Move past the current key-value pair else: value = True # Assign a True value for standalone flags i += 1 # Move to the next item - unknown_args_dict[key] = value + addt_args_dict[key] = value - # Update the ModelConfig instance with the unknown args - self.__dict__.update(unknown_args_dict) + # Update the ModelConfig instance with the additional args + self.__dict__.update(addt_args_dict) def __post_init__(self): """ @@ -70,11 +70,11 @@ def __post_init__(self): raise ValueError(f"Unsupported pipeline: {self.pipeline}") parser = HfArgumentParser(ModelConfig) -args, unknown_args = parser.parse_args_into_dataclasses( +args, additional_args = parser.parse_args_into_dataclasses( return_remaining_strings=True ) -args.process_unknown_args(unknown_args) +args.process_additional_args(additional_args) model_args = asdict(args) model_args["local_files_only"] = not model_args.pop('use_remote_files') From b6351097ab024311388f85e8b3b5c59b44f1776a Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 6 Feb 2024 11:24:56 -0800 Subject: [PATCH 04/24] fix: library version --- presets/inference/text-generation/inference-api.py | 3 ++- presets/inference/text-generation/requirements.txt | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index c41cffa8e..3fa9ddac9 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -101,7 +101,8 @@ def __post_init__(self): try: # Attempt to load the generation configuration - default_generate_config = GenerationConfig.from_pretrained("/workspace/tfs/weights", local_files_only=True).to_dict() + default_generate_config = GenerationConfig.from_pretrained( + args.pretrained_model_name_or_path, local_files_only=True).to_dict() except Exception as e: default_generate_config = {} diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 424159dba..93cf064d9 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -5,7 +5,6 @@ accelerate==0.23.0 fastapi==0.103.2 pydantic==1.10.9 uvicorn[standard]==0.23.2 -bitsandbytes==0.41.1 -scipy==1.10.1 +bitsandbytes==0.42.0 deepspeed==0.11.1 gputil==1.4.0 \ No newline at end of file From 6208731b5276a3100e1862fc0724b004b3336e76 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 6 Feb 2024 11:30:36 -0800 Subject: [PATCH 05/24] fix: formatting --- presets/inference/text-generation/inference-api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index 3fa9ddac9..d84379cab 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -102,7 +102,9 @@ def __post_init__(self): try: # Attempt to load the generation configuration default_generate_config = GenerationConfig.from_pretrained( - 
args.pretrained_model_name_or_path, local_files_only=True).to_dict() + args.pretrained_model_name_or_path, + local_files_only=args.local_files_only + ).to_dict() except Exception as e: default_generate_config = {} From 8d1757749008d7454c7072f98a9f089978e8140e Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 12 Feb 2024 15:51:06 -0800 Subject: [PATCH 06/24] fix: add some additional params --- presets/inference/text-generation/inference-api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference-api.py index d84379cab..764d135aa 100644 --- a/presets/inference/text-generation/inference-api.py +++ b/presets/inference/text-generation/inference-api.py @@ -28,7 +28,7 @@ class ModelConfig: resume_download: bool = field(default=False, metadata={"help": "Resume an interrupted download"}) proxies: Optional[str] = field(default=None, metadata={"help": "Proxy configuration for downloading the model"}) output_loading_info: bool = field(default=False, metadata={"help": "Output additional loading information"}) - use_remote_files: bool = field(default=False, metadata={"help": "Allow using remote files, default is local only"}) + allow_remote_files: bool = field(default=False, metadata={"help": "Allow using remote files, default is local only"}) revision: str = field(default="main", metadata={"help": "Specific model version to use"}) trust_remote_code: bool = field(default=False, metadata={"help": "Enable trusting remote code when loading the model"}) load_in_4bit: bool = field(default=False, metadata={"help": "Load model in 4-bit mode"}) @@ -77,7 +77,7 @@ def __post_init__(self): args.process_additional_args(additional_args) model_args = asdict(args) -model_args["local_files_only"] = not model_args.pop('use_remote_files') +model_args["local_files_only"] = not model_args.pop('allow_remote_files') model_pipeline = model_args.pop('pipeline') app = FastAPI() From 02189bacdcdf4f1c0c640b80ea79106744486095 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 12 Feb 2024 20:31:58 -0800 Subject: [PATCH 07/24] fix: rename inference-api.py to support python naming --- docker/presets/tfs-onnx/Dockerfile | 2 +- docker/presets/tfs/Dockerfile | 2 +- pkg/inference/preset-inferences.go | 2 +- pkg/inference/preset-inferences_test.go | 4 +- .../{inference-api.py => inference_api.py} | 0 .../{inference-api.py => inference_api.py} | 0 .../{inference-api.py => inference_api.py} | 0 .../text-generation/test_inference_api.py | 84 +++++++++++++++++++ .../falcon-40b-instruct.yaml | 2 +- .../test/manifests/falcon-40b/falcon-40b.yaml | 2 +- .../falcon-7b-instruct.yaml | 2 +- .../test/manifests/falcon-7b/falcon-7b.yaml | 2 +- .../llama-2-13b-chat/llama-2-13b-chat.yaml | 2 +- .../manifests/llama-2-13b/llama-2-13b.yaml | 2 +- .../llama-2-7b-chat/llama-2-7b-chat.yaml | 2 +- .../test/manifests/llama-2-7b/llama-2-7b.yaml | 2 +- .../mistral-7b-instruct.yaml | 2 +- .../test/manifests/mistral-7b/mistral-7b.yaml | 2 +- presets/test/manifests/phi-2/phi-2.yaml | 2 +- 19 files changed, 100 insertions(+), 16 deletions(-) rename presets/inference/llama2-chat/{inference-api.py => inference_api.py} (100%) rename presets/inference/llama2-completion/{inference-api.py => inference_api.py} (100%) rename presets/inference/text-generation/{inference-api.py => inference_api.py} (100%) create mode 100644 presets/inference/text-generation/test_inference_api.py diff --git a/docker/presets/tfs-onnx/Dockerfile 
b/docker/presets/tfs-onnx/Dockerfile index 26b92f8ff..12e788346 100644 --- a/docker/presets/tfs-onnx/Dockerfile +++ b/docker/presets/tfs-onnx/Dockerfile @@ -17,7 +17,7 @@ RUN echo $VERSION > /workspace/tfs/version.txt COPY kaito/presets/inference/${MODEL_TYPE}/requirements.txt /workspace/tfs/requirements.txt RUN pip install --no-cache-dir -r requirements.txt -COPY kaito/presets/inference/${MODEL_TYPE}/inference-api.py /workspace/tfs/inference-api.py +COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py # Convert to ONNX Runtime # RUN python convert_to_onnx.py ${MODEL_NAME} diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index e5826027b..9ddcf9b93 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -16,7 +16,7 @@ RUN echo $VERSION > /workspace/tfs/version.txt COPY kaito/presets/inference/${MODEL_TYPE}/requirements.txt /workspace/tfs/requirements.txt RUN pip install --no-cache-dir -r requirements.txt -COPY kaito/presets/inference/${MODEL_TYPE}/inference-api.py /workspace/tfs/inference-api.py +COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py # Copy the entire model weights to the weights directory COPY ${WEIGHTS_PATH} /workspace/tfs/weights diff --git a/pkg/inference/preset-inferences.go b/pkg/inference/preset-inferences.go index 7d49bfece..9b02012b7 100644 --- a/pkg/inference/preset-inferences.go +++ b/pkg/inference/preset-inferences.go @@ -21,7 +21,7 @@ import ( const ( ProbePath = "/healthz" Port5000 = int32(5000) - InferenceFile = "inference-api.py" + InferenceFile = "inference_api.py" DefaultVolumeMountPath = "/dev/shm" ) diff --git a/pkg/inference/preset-inferences_test.go b/pkg/inference/preset-inferences_test.go index bb73b7894..cd8df067c 100644 --- a/pkg/inference/preset-inferences_test.go +++ b/pkg/inference/preset-inferences_test.go @@ -37,7 +37,7 @@ func TestCreatePresetInference(t *testing.T) { workload: "Deployment", // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams // So expected cmd consists of shell command and inference file - expectedCmd: "/bin/sh -c inference-api.py", + expectedCmd: "/bin/sh -c inference_api.py", }, "test-distributed-model": { @@ -48,7 +48,7 @@ func TestCreatePresetInference(t *testing.T) { c.On("Create", mock.IsType(context.Background()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) }, workload: "StatefulSet", - expectedCmd: "/bin/sh -c inference-api.py", + expectedCmd: "/bin/sh -c inference_api.py", }, } diff --git a/presets/inference/llama2-chat/inference-api.py b/presets/inference/llama2-chat/inference_api.py similarity index 100% rename from presets/inference/llama2-chat/inference-api.py rename to presets/inference/llama2-chat/inference_api.py diff --git a/presets/inference/llama2-completion/inference-api.py b/presets/inference/llama2-completion/inference_api.py similarity index 100% rename from presets/inference/llama2-completion/inference-api.py rename to presets/inference/llama2-completion/inference_api.py diff --git a/presets/inference/text-generation/inference-api.py b/presets/inference/text-generation/inference_api.py similarity index 100% rename from presets/inference/text-generation/inference-api.py rename to presets/inference/text-generation/inference_api.py diff --git a/presets/inference/text-generation/test_inference_api.py b/presets/inference/text-generation/test_inference_api.py new file mode 100644 index 000000000..81561d1b7 --- /dev/null +++ 
b/presets/inference/text-generation/test_inference_api.py @@ -0,0 +1,84 @@ +from fastapi import FastAPI +from fastapi.testclient import TestClient +from inference_api import app + +client = TestClient(app) + +# Non-Inference Endpoints +def test_read_main(): + response = client.get("/") + assert response.status_code == 200 + assert response.json() == "Server is running" + +def test_health_check(): + response = client.get("/healthz") + # Assume we have a GPU available and the model & pipeline initialized for testing + assert response.status_code == 200 + assert response.json() == {"status": "Healthy"} + +def test_get_metrics(): + response = client.get("/metrics") + assert response.status_code == 200 + # Check the structure of the response to ensure GPU metrics are returned + assert "gpu_info" in response.json() + +# Inference Endpoint +def test_text_generation(): + request_data = { + "prompt": "Hello, world!", + "return_full_text": True, + "clean_up_tokenization_spaces": False, + "generate_kwargs": {"max_length": 50, "min_length": 10} # Example generate_kwargs + } + response = client.post("/chat", json=request_data) + assert response.status_code == 200 + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the result text is not empty + +def test_conversational(): + messages = [ + {"role": "user", "content": "What is your favourite condiment?"}, + {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"}, + {"role": "user", "content": "Do you have mayonnaise recipes?"} + ] + request_data = { + "messages": messages, + "generate_kwargs": {"max_length": 50} # Example generate_kwargs for conversational + } + response = client.post("/chat", json=request_data) + assert response.status_code == 200 + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the conversation result is not empty + +# Invalid tests +def test_invalid_pipeline(): + request_data = { + "prompt": "This should fail", + "pipeline": "invalid-pipeline" # Invalid pipeline type + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response + assert "Invalid pipeline type" in response.json().get("detail", "") + +def test_missing_prompt(): + request_data = { + # "prompt" is missing + "return_full_text": True, + "clean_up_tokenization_spaces": False + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing prompt + assert "Text generation parameter prompt required" in response.json().get("detail", "") + +def test_missing_messages_for_conversation(): + request_data = { + # "messages" is missing for conversational pipeline + "pipeline": "conversational" + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing messages + assert "Conversational parameter messages required" in response.json().get("detail", "") + + diff --git a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml index bd6280b9f..04c49b6ce 100644 --- a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml +++ b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - 
accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 4 # Requesting 4 GPUs diff --git a/presets/test/manifests/falcon-40b/falcon-40b.yaml b/presets/test/manifests/falcon-40b/falcon-40b.yaml index a125d838d..96610a9fb 100644 --- a/presets/test/manifests/falcon-40b/falcon-40b.yaml +++ b/presets/test/manifests/falcon-40b/falcon-40b.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 4 # Requesting 4 GPUs diff --git a/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml b/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml index ed8913e76..d742ee08b 100644 --- a/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml +++ b/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 diff --git a/presets/test/manifests/falcon-7b/falcon-7b.yaml b/presets/test/manifests/falcon-7b/falcon-7b.yaml index 2f1aff077..ed86043e7 100644 --- a/presets/test/manifests/falcon-7b/falcon-7b.yaml +++ b/presets/test/manifests/falcon-7b/falcon-7b.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 diff --git a/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml b/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml index 2cf9867e7..7ab7dcb21 100644 --- a/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml +++ b/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml @@ -35,7 +35,7 @@ spec: - | echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$') - cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference-api.py + cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py resources: limits: nvidia.com/gpu: "1" diff --git a/presets/test/manifests/llama-2-13b/llama-2-13b.yaml b/presets/test/manifests/llama-2-13b/llama-2-13b.yaml index b3d885ce7..46c609bbb 100644 --- a/presets/test/manifests/llama-2-13b/llama-2-13b.yaml +++ b/presets/test/manifests/llama-2-13b/llama-2-13b.yaml @@ -35,7 +35,7 @@ spec: - | echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep 
-o '[^-]*$') - cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference-api.py + cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference_api.py resources: limits: nvidia.com/gpu: "1" diff --git a/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml b/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml index b38cbfe3b..f26b003a8 100644 --- a/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml +++ b/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml @@ -19,7 +19,7 @@ spec: command: - /bin/sh - -c - - cd /workspace/llama/llama-2 && torchrun inference-api.py + - cd /workspace/llama/llama-2 && torchrun inference_api.py resources: limits: nvidia.com/gpu: "1" diff --git a/presets/test/manifests/llama-2-7b/llama-2-7b.yaml b/presets/test/manifests/llama-2-7b/llama-2-7b.yaml index 3e973bcd0..f68d43c64 100644 --- a/presets/test/manifests/llama-2-7b/llama-2-7b.yaml +++ b/presets/test/manifests/llama-2-7b/llama-2-7b.yaml @@ -19,7 +19,7 @@ spec: command: - /bin/sh - -c - - cd /workspace/llama/llama-2 && torchrun inference-api.py + - cd /workspace/llama/llama-2 && torchrun inference_api.py resources: limits: nvidia.com/gpu: "1" diff --git a/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml b/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml index cacfbd484..e998ce0ed 100644 --- a/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml +++ b/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 diff --git a/presets/test/manifests/mistral-7b/mistral-7b.yaml b/presets/test/manifests/mistral-7b/mistral-7b.yaml index 287d435a7..5521ef2f8 100644 --- a/presets/test/manifests/mistral-7b/mistral-7b.yaml +++ b/presets/test/manifests/mistral-7b/mistral-7b.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 diff --git a/presets/test/manifests/phi-2/phi-2.yaml b/presets/test/manifests/phi-2/phi-2.yaml index b250d6248..9882b80cb 100644 --- a/presets/test/manifests/phi-2/phi-2.yaml +++ b/presets/test/manifests/phi-2/phi-2.yaml @@ -18,7 +18,7 @@ spec: command: - /bin/sh - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16 + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: nvidia.com/gpu: 2 From 8dd38ac0e1661aeb1d1d2a17673fcccd7e99894d Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 12 Feb 2024 23:56:01 -0800 Subject: [PATCH 08/24] fix: add tests --- 
.../text-generation/inference_api.py | 4 +- .../text-generation/tests/run_tests.sh | 10 ++ .../tests/test_inference_api.py | 125 ++++++++++++++++++ 3 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 presets/inference/text-generation/tests/run_tests.sh create mode 100644 presets/inference/text-generation/tests/test_inference_api.py diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index 764d135aa..73c7b5095 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -160,11 +160,11 @@ class UnifiedRequestModel(BaseModel): generate_kwargs: Optional[GenerateKwargs] = Field(None, description="Additional kwargs for generate method") # Field for conversational model - messages: Optional[List[dict]] = Field(None, description="Messages for conversational model") + messages: Optional[List[Dict[str, str]]] = Field(None, description="Messages for conversational model") @app.post("/chat") def generate_text(request_model: UnifiedRequestModel): - user_generate_kwargs = request_model.generate_kwargs or {} + user_generate_kwargs = request_model.generate_kwargs.dict() if request_model.generate_kwargs else {} generate_kwargs = {**default_generate_config, **user_generate_kwargs} if args.pipeline == "text-generation": diff --git a/presets/inference/text-generation/tests/run_tests.sh b/presets/inference/text-generation/tests/run_tests.sh new file mode 100644 index 000000000..6b4104e38 --- /dev/null +++ b/presets/inference/text-generation/tests/run_tests.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +echo "Running text-generation tests..." +python3 test_inference_api.py --pipeline text-generation --pretrained_model_name_or_path microsoft/phi-2 --allow_remote_files True + +echo "Running conversational tests..." +python3 test_inference_api.py --pipeline conversational --pretrained_model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 --allow_remote_files True + +echo "Running invalid-pipeline tests..." +python3 test_inference_api.py --pipeline invalid-pipeline --pretrained_model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 --allow_remote_files True diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py new file mode 100644 index 000000000..4d752a831 --- /dev/null +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -0,0 +1,125 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+import sys +from pathlib import Path + +# Get the parent directory of the current file +parent_dir = str(Path(__file__).resolve().parent.parent) +# Add the parent directory to sys.path +sys.path.append(parent_dir) + +import argparse +from unittest.mock import patch + +from fastapi.testclient import TestClient + +# Parse the command-line arguments +parser = argparse.ArgumentParser() +parser.add_argument("--pipeline", required=True, help="Pipeline type") +parser.add_argument("--pretrained_model_name_or_path", required=True, help="Model path") +parser.add_argument("--allow_remote_files", default=True, help="Allow models to be downloaded for tests") +args = parser.parse_args() +pipeline_type = args.pipeline + +try: + from inference_api import ModelConfig, app +except ValueError as e: + if pipeline_type not in {"text-generation", "conversational"}: + # Pipeline is invalid, handle and exit + print(f"Correctly caught invalid pipeline during import") + sys.exit(0) + else: + raise +except Exception as e: + # For all other exceptions, re-raise + raise + +def run_tests(): + client = TestClient(app) + test_read_main(client) + test_health_check(client) + test_get_metrics(client) + test_get_metrics_no_gpus(client) + # Pipeline must be valid to pass import + if pipeline_type == "text-generation": + test_text_generation(client) + test_missing_prompt(client) + elif pipeline_type == "conversational": + test_conversational(client) + test_missing_messages_for_conversation(client) + +def test_read_main(client): + response = client.get("/") + server_msg, status_code = response.json() + assert server_msg == "Server is running" + assert status_code == 200 + +def test_health_check(client): + response = client.get("/healthz") + # Assume we have a GPU available and the model & pipeline initialized for testing + assert response.status_code == 200 + assert response.json() == {"status": "Healthy"} + +def test_get_metrics(client): + response = client.get("/metrics") + assert response.status_code == 200 + # Check the structure of the response to ensure GPU metrics are returned + assert "gpu_info" in response.json() + +def test_get_metrics_no_gpus(client): + with patch('GPUtil.getGPUs', return_value=[]) as mock_getGPUs: + response = client.get("/metrics") + assert response.status_code == 200 + assert response.json()["gpu_info"] == [] # Expecting an empty list + +def test_text_generation(client): + request_data = { + "prompt": "Hello, world!", + "return_full_text": True, + "clean_up_tokenization_spaces": False, + "generate_kwargs": {"max_length": 50, "min_length": 10} # Example generate_kwargs + } + response = client.post("/chat", json=request_data) + assert response.status_code == 200 + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the result text is not empty + +def test_missing_prompt(client): + request_data = { + # "prompt" is missing + "return_full_text": True, + "clean_up_tokenization_spaces": False, + "generate_kwargs": {"max_length": 50} + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing prompt + assert "Text generation parameter prompt required" in response.json().get("detail", "") + +def test_conversational(client): + messages = [ + {"role": "user", "content": "What is your favourite condiment?"}, + {"role": "assistant", "content": "Well, Im quite partial to a good squeeze of fresh lemon juice. 
It adds just the right amount of zesty flavour to whatever Im cooking up in the kitchen!"}, + {"role": "user", "content": "Do you have mayonnaise recipes?"} + ] + request_data = { + "messages": messages, + "generate_kwargs": {"max_new_tokens": 1000, "do_sample": True} + } + response = client.post("/chat", json=request_data) + + assert response.status_code == 200 + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the conversation result is not empty + +def test_missing_messages_for_conversation(client): + request_data = { + # "messages" is missing for conversational pipeline + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing messages + assert "Conversational parameter messages required" in response.json().get("detail", "") + +if __name__ == "__main__": + run_tests() \ No newline at end of file From b97712dec83a4eee4c00390a85c4eaa1cc5460b8 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 12 Feb 2024 23:57:16 -0800 Subject: [PATCH 09/24] fix: remove --- .../text-generation/test_inference_api.py | 84 ------------------- 1 file changed, 84 deletions(-) delete mode 100644 presets/inference/text-generation/test_inference_api.py diff --git a/presets/inference/text-generation/test_inference_api.py b/presets/inference/text-generation/test_inference_api.py deleted file mode 100644 index 81561d1b7..000000000 --- a/presets/inference/text-generation/test_inference_api.py +++ /dev/null @@ -1,84 +0,0 @@ -from fastapi import FastAPI -from fastapi.testclient import TestClient -from inference_api import app - -client = TestClient(app) - -# Non-Inference Endpoints -def test_read_main(): - response = client.get("/") - assert response.status_code == 200 - assert response.json() == "Server is running" - -def test_health_check(): - response = client.get("/healthz") - # Assume we have a GPU available and the model & pipeline initialized for testing - assert response.status_code == 200 - assert response.json() == {"status": "Healthy"} - -def test_get_metrics(): - response = client.get("/metrics") - assert response.status_code == 200 - # Check the structure of the response to ensure GPU metrics are returned - assert "gpu_info" in response.json() - -# Inference Endpoint -def test_text_generation(): - request_data = { - "prompt": "Hello, world!", - "return_full_text": True, - "clean_up_tokenization_spaces": False, - "generate_kwargs": {"max_length": 50, "min_length": 10} # Example generate_kwargs - } - response = client.post("/chat", json=request_data) - assert response.status_code == 200 - data = response.json() - assert "Result" in data - assert len(data["Result"]) > 0 # Check if the result text is not empty - -def test_conversational(): - messages = [ - {"role": "user", "content": "What is your favourite condiment?"}, - {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. 
It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"}, - {"role": "user", "content": "Do you have mayonnaise recipes?"} - ] - request_data = { - "messages": messages, - "generate_kwargs": {"max_length": 50} # Example generate_kwargs for conversational - } - response = client.post("/chat", json=request_data) - assert response.status_code == 200 - data = response.json() - assert "Result" in data - assert len(data["Result"]) > 0 # Check if the conversation result is not empty - -# Invalid tests -def test_invalid_pipeline(): - request_data = { - "prompt": "This should fail", - "pipeline": "invalid-pipeline" # Invalid pipeline type - } - response = client.post("/chat", json=request_data) - assert response.status_code == 400 # Expecting a Bad Request response - assert "Invalid pipeline type" in response.json().get("detail", "") - -def test_missing_prompt(): - request_data = { - # "prompt" is missing - "return_full_text": True, - "clean_up_tokenization_spaces": False - } - response = client.post("/chat", json=request_data) - assert response.status_code == 400 # Expecting a Bad Request response due to missing prompt - assert "Text generation parameter prompt required" in response.json().get("detail", "") - -def test_missing_messages_for_conversation(): - request_data = { - # "messages" is missing for conversational pipeline - "pipeline": "conversational" - } - response = client.post("/chat", json=request_data) - assert response.status_code == 400 # Expecting a Bad Request response due to missing messages - assert "Conversational parameter messages required" in response.json().get("detail", "") - - From 82bdb9715fef17fbaa54e74a6a1de6a5cf3b08cd Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 15:20:41 -0800 Subject: [PATCH 10/24] fix: added comprehensive tests --- .../text-generation/requirements.txt | 5 +- .../text-generation/tests/run_tests.sh | 10 -- .../tests/test_inference_api.py | 170 +++++++++--------- .../tests/test_model_config.py | 89 +++++++++ 4 files changed, 178 insertions(+), 96 deletions(-) delete mode 100644 presets/inference/text-generation/tests/run_tests.sh create mode 100644 presets/inference/text-generation/tests/test_model_config.py diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 3bbbb3ab9..46e964b23 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -7,4 +7,7 @@ pydantic==1.10.9 uvicorn[standard]==0.23.2 bitsandbytes==0.42.0 deepspeed==0.11.1 -gputil==1.4.0 \ No newline at end of file +gputil==1.4.0 +# For UTs +pytest=8.0.0 +httpx=0.26.0 \ No newline at end of file diff --git a/presets/inference/text-generation/tests/run_tests.sh b/presets/inference/text-generation/tests/run_tests.sh deleted file mode 100644 index 6b4104e38..000000000 --- a/presets/inference/text-generation/tests/run_tests.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -echo "Running text-generation tests..." -python3 test_inference_api.py --pipeline text-generation --pretrained_model_name_or_path microsoft/phi-2 --allow_remote_files True - -echo "Running conversational tests..." -python3 test_inference_api.py --pipeline conversational --pretrained_model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 --allow_remote_files True - -echo "Running invalid-pipeline tests..." 
-python3 test_inference_api.py --pipeline invalid-pipeline --pretrained_model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 --allow_remote_files True diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py index 4d752a831..d25a67ed3 100644 --- a/presets/inference/text-generation/tests/test_inference_api.py +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -1,78 +1,76 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +import importlib import sys from pathlib import Path +from unittest.mock import patch + +import pytest +from fastapi.testclient import TestClient # Get the parent directory of the current file parent_dir = str(Path(__file__).resolve().parent.parent) # Add the parent directory to sys.path sys.path.append(parent_dir) -import argparse -from unittest.mock import patch +@pytest.fixture(params=[ + {"pipeline": "text-generation", "model_path": "microsoft/phi-2", "torch_dtype": "bfloat16"}, + {"pipeline": "conversational", "model_path": "mistralai/Mistral-7B-Instruct-v0.2", "torch_dtype": "bfloat16"}, +]) +def configured_app(request): + original_argv = sys.argv.copy() + # Use request.param to set correct test arguments for each configuration + test_args = [ + 'program_name', + '--pipeline', request.param['pipeline'], + '--pretrained_model_name_or_path', request.param['model_path'], + '--torch_dtype', request.param['torch_dtype'] + ] + sys.argv = test_args -from fastapi.testclient import TestClient + import inference_api + importlib.reload(inference_api) # Reload to prevent module caching + from inference_api import app -# Parse the command-line arguments -parser = argparse.ArgumentParser() -parser.add_argument("--pipeline", required=True, help="Pipeline type") -parser.add_argument("--pretrained_model_name_or_path", required=True, help="Model path") -parser.add_argument("--allow_remote_files", default=True, help="Allow models to be downloaded for tests") -args = parser.parse_args() -pipeline_type = args.pipeline + # Attach the request params to the app instance for access in tests + app.test_config = request.param + yield app -try: - from inference_api import ModelConfig, app -except ValueError as e: - if pipeline_type not in {"text-generation", "conversational"}: - # Pipeline is invalid, handle and exit - print(f"Correctly caught invalid pipeline during import") - sys.exit(0) - else: - raise -except Exception as e: - # For all other exceptions, re-raise - raise - -def run_tests(): - client = TestClient(app) - test_read_main(client) - test_health_check(client) - test_get_metrics(client) - test_get_metrics_no_gpus(client) - # Pipeline must be valid to pass import - if pipeline_type == "text-generation": - test_text_generation(client) - test_missing_prompt(client) - elif pipeline_type == "conversational": - test_conversational(client) - test_missing_messages_for_conversation(client) - -def test_read_main(client): - response = client.get("/") - server_msg, status_code = response.json() - assert server_msg == "Server is running" - assert status_code == 200 - -def test_health_check(client): - response = client.get("/healthz") - # Assume we have a GPU available and the model & pipeline initialized for testing - assert response.status_code == 200 - assert response.json() == {"status": "Healthy"} + sys.argv = original_argv + +def test_conversational(configured_app): + if configured_app.test_config['pipeline'] != 'conversational': + pytest.skip("Skipping 
non-conversational tests") + client = TestClient(configured_app) + messages = [ + {"role": "user", "content": "What is your favourite condiment?"}, + {"role": "assistant", "content": "Well, Im quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever Im cooking up in the kitchen!"}, + {"role": "user", "content": "Do you have mayonnaise recipes?"} + ] + request_data = { + "messages": messages, + "generate_kwargs": {"max_new_tokens": 20, "do_sample": True} + } + response = client.post("/chat", json=request_data) -def test_get_metrics(client): - response = client.get("/metrics") assert response.status_code == 200 - # Check the structure of the response to ensure GPU metrics are returned - assert "gpu_info" in response.json() + data = response.json() + assert "Result" in data + assert len(data["Result"]) > 0 # Check if the conversation result is not empty -def test_get_metrics_no_gpus(client): - with patch('GPUtil.getGPUs', return_value=[]) as mock_getGPUs: - response = client.get("/metrics") - assert response.status_code == 200 - assert response.json()["gpu_info"] == [] # Expecting an empty list +def test_missing_messages_for_conversation(configured_app): + if configured_app.test_config['pipeline'] != 'conversational': + pytest.skip("Skipping non-conversational tests") + client = TestClient(configured_app) + request_data = { + # "messages" is missing for conversational pipeline + } + response = client.post("/chat", json=request_data) + assert response.status_code == 400 # Expecting a Bad Request response due to missing messages + assert "Conversational parameter messages required" in response.json().get("detail", "") -def test_text_generation(client): +def test_text_generation(configured_app): + if configured_app.test_config['pipeline'] != 'text-generation': + pytest.skip("Skipping non-text-generation tests") + client = TestClient(configured_app) request_data = { "prompt": "Hello, world!", "return_full_text": True, @@ -85,7 +83,10 @@ def test_text_generation(client): assert "Result" in data assert len(data["Result"]) > 0 # Check if the result text is not empty -def test_missing_prompt(client): +def test_missing_prompt(configured_app): + if configured_app.test_config['pipeline'] != 'text-generation': + pytest.skip("Skipping non-text-generation tests") + client = TestClient(configured_app) request_data = { # "prompt" is missing "return_full_text": True, @@ -95,31 +96,30 @@ def test_missing_prompt(client): response = client.post("/chat", json=request_data) assert response.status_code == 400 # Expecting a Bad Request response due to missing prompt assert "Text generation parameter prompt required" in response.json().get("detail", "") - -def test_conversational(client): - messages = [ - {"role": "user", "content": "What is your favourite condiment?"}, - {"role": "assistant", "content": "Well, Im quite partial to a good squeeze of fresh lemon juice. 
It adds just the right amount of zesty flavour to whatever Im cooking up in the kitchen!"}, - {"role": "user", "content": "Do you have mayonnaise recipes?"} - ] - request_data = { - "messages": messages, - "generate_kwargs": {"max_new_tokens": 1000, "do_sample": True} - } - response = client.post("/chat", json=request_data) +def test_read_main(configured_app): + client = TestClient(configured_app) + response = client.get("/") + server_msg, status_code = response.json() + assert server_msg == "Server is running" + assert status_code == 200 + +def test_health_check(configured_app): + client = TestClient(configured_app) + response = client.get("/healthz") + # Assume we have a GPU available and the model & pipeline initialized for testing assert response.status_code == 200 - data = response.json() - assert "Result" in data - assert len(data["Result"]) > 0 # Check if the conversation result is not empty + assert response.json() == {"status": "Healthy"} -def test_missing_messages_for_conversation(client): - request_data = { - # "messages" is missing for conversational pipeline - } - response = client.post("/chat", json=request_data) - assert response.status_code == 400 # Expecting a Bad Request response due to missing messages - assert "Conversational parameter messages required" in response.json().get("detail", "") +def test_get_metrics(configured_app): + client = TestClient(configured_app) + response = client.get("/metrics") + assert response.status_code == 200 + assert "gpu_info" in response.json() -if __name__ == "__main__": - run_tests() \ No newline at end of file +def test_get_metrics_no_gpus(configured_app): + client = TestClient(configured_app) + with patch('GPUtil.getGPUs', return_value=[]) as mock_getGPUs: + response = client.get("/metrics") + assert response.status_code == 200 + assert response.json()["gpu_info"] == [] diff --git a/presets/inference/text-generation/tests/test_model_config.py b/presets/inference/text-generation/tests/test_model_config.py new file mode 100644 index 000000000..d0662ba0e --- /dev/null +++ b/presets/inference/text-generation/tests/test_model_config.py @@ -0,0 +1,89 @@ +import importlib +import sys +from pathlib import Path + +import pytest + +# Get the parent directory of the current file +parent_dir = str(Path(__file__).resolve().parent.parent) +# Add the parent directory to sys.path +sys.path.append(parent_dir) + +@pytest.fixture(params=[ + {"pipeline": "text-generation", "model_path": "microsoft/phi-2", "torch_dtype": "bfloat16"}, + {"pipeline": "conversational", "model_path": "mistralai/Mistral-7B-Instruct-v0.2", "torch_dtype": "bfloat16"}, +]) +def configured_model_config(request): + original_argv = sys.argv.copy() + + sys.argv = [ + 'program_name', + '--pipeline', request.param['pipeline'], + '--pretrained_model_name_or_path', request.param['model_path'], + '--torch_dtype', request.param['torch_dtype'] + ] + + import inference_api + importlib.reload(inference_api) + from inference_api import ModelConfig + + # Create and configure the ModelConfig instance + model_config = ModelConfig( + pipeline=request.param['pipeline'], + pretrained_model_name_or_path=request.param['model_path'], + torch_dtype=request.param['torch_dtype'] + ) + + yield model_config + + # Restore the original sys.argv after the test is done + sys.argv = original_argv + +def test_process_additional_args(configured_model_config): + config = configured_model_config + + # Simulate additional command-line arguments + additional_args = [ + "--new_arg1", "value1", + "--new_arg2", + 
"--new_arg3", "value3", + "--flag_arg" + ] + + # Process the additional arguments + config.process_additional_args(additional_args) + + # Assertions to verify that additional arguments were processed correctly + assert getattr(config, "new_arg1", None) == "value1" + assert getattr(config, "new_arg2", None) is True + assert getattr(config, "new_arg3", None) == "value3" + assert getattr(config, "flag_arg", None) is True + +# Test case for ignoring arguments prefixed with '--' when expecting a value +def test_ignore_double_dash_arguments(configured_model_config): + config = configured_model_config + additional_args = [ + "--new_arg1", "--new_arg2", + "--new_arg3", "correct_value" + ] + + config.process_additional_args(additional_args) + + # new_arg1 should be set to True since its value is incorrectly prefixed with '--' + assert getattr(config, "new_arg1", None) is True + assert getattr(config, "new_arg2", None) is True + assert getattr(config, "new_arg3", None) == "correct_value" + +# Test case to verify handling unsupported pipeline values +def test_unsupported_pipeline_raises_value_error(configured_model_config): + with pytest.raises(ValueError) as excinfo: + from inference_api import ModelConfig + ModelConfig(pipeline="unsupported_pipeline") + assert "Unsupported pipeline" in str(excinfo.value) + +# Test case for validating torch_dtype +def test_invalid_torch_dtype_raises_value_error(configured_model_config): + with pytest.raises(ValueError) as excinfo: + from inference_api import ModelConfig + ModelConfig(pipeline="text-generation", torch_dtype="unsupported_dtype") + assert "Invalid torch dtype" in str(excinfo.value) \ No newline at end of file From fcb3f7d63f0fa24b2c1c33ce77775f70bf099bcf Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 15:38:53 -0800 Subject: [PATCH 11/24] fix: typo --- presets/inference/text-generation/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 46e964b23..6cfc08e21 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -9,5 +9,5 @@ bitsandbytes==0.42.0 deepspeed==0.11.1 gputil==1.4.0 # For UTs -pytest=8.0.0 -httpx=0.26.0 \ No newline at end of file +pytest==8.0.0 +httpx==0.26.0 \ No newline at end of file From fb3e7c24c5b9149e02d2420b5d4a3d10edd732c1 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 16:18:49 -0800 Subject: [PATCH 12/24] fix: add git install --- docker/presets/llama-2/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 5d888cc29..c065f59e2 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -7,6 +7,12 @@ FROM python:3.8-slim WORKDIR /workspace +# Install git +RUN apt-get update && \ + apt-get install -y git && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + RUN git clone https://github.com/facebookresearch/llama WORKDIR /workspace/llama From 7758ba01f8d259e1c68579ab1ac9e60414beed76 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 19:28:14 -0800 Subject: [PATCH 13/24] fix: sed using bash --- docker/presets/llama-2/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index c065f59e2..9b7f3d394 100644 --- a/docker/presets/llama-2/Dockerfile +++ 
b/docker/presets/llama-2/Dockerfile @@ -17,7 +17,8 @@ RUN git clone https://github.com/facebookresearch/llama WORKDIR /workspace/llama -RUN sed -i $'/torch.distributed.init_process_group("nccl")/c\\ import datetime\\\n torch.distributed.init_process_group("nccl", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py +# RUN sed -i $'/torch.distributed.init_process_group("nccl")/c\\ import datetime\\\n torch.distributed.init_process_group("nccl", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py +RUN ["/bin/bash", "-c", "sed -i $'/torch.distributed.init_process_group(\"nccl\")/c\\ import datetime\\\n torch.distributed.init_process_group(\"nccl\", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py"] RUN pip install -e . RUN pip install torch==2.2.0 fastapi==0.103.2 pydantic==1.10.9 gputil==1.4.0 From 77c7ed8b082185aa92d9f8cbabf88bdc8c89b7e0 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 20:49:58 -0800 Subject: [PATCH 14/24] fix: add unit tests for cpp --- Makefile | 2 ++ .../text-generation/tests/test_inference_api.py | 12 ++++++++---- .../text-generation/tests/test_model_config.py | 7 +++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index bd92718b1..fd9c5ca54 100644 --- a/Makefile +++ b/Makefile @@ -82,6 +82,8 @@ fmt: ## Run go fmt against code. unit-test: ## Run unit tests. go test -v $(shell go list ./pkg/... ./api/... | grep -v /vendor) -race -coverprofile=coverage.txt -covermode=atomic go tool cover -func=coverage.txt + pip install -r presets/inference/text-generation/requirements.txt + pytest -o log_cli=true -o log_cli_level=INFO . $(E2E_TEST): (cd test/e2e && go test -c . -o $(E2E_TEST)) diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py index d25a67ed3..ff9866bbb 100644 --- a/presets/inference/text-generation/tests/test_inference_api.py +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -4,6 +4,7 @@ from unittest.mock import patch import pytest +import torch from fastapi.testclient import TestClient # Get the parent directory of the current file @@ -12,8 +13,8 @@ sys.path.append(parent_dir) @pytest.fixture(params=[ - {"pipeline": "text-generation", "model_path": "microsoft/phi-2", "torch_dtype": "bfloat16"}, - {"pipeline": "conversational", "model_path": "mistralai/Mistral-7B-Instruct-v0.2", "torch_dtype": "bfloat16"}, + {"pipeline": "text-generation", "model_path": "stanford-crfm/alias-gpt2-small-x21"}, + {"pipeline": "conversational", "model_path": "stanford-crfm/alias-gpt2-small-x21"}, ]) def configured_app(request): original_argv = sys.argv.copy() @@ -22,7 +23,7 @@ def configured_app(request): 'program_name', '--pipeline', request.param['pipeline'], '--pretrained_model_name_or_path', request.param['model_path'], - '--torch_dtype', request.param['torch_dtype'] + '--allow_remote_files', 'True' ] sys.argv = test_args @@ -105,9 +106,12 @@ def test_read_main(configured_app): assert status_code == 200 def test_health_check(configured_app): + device = "GPU" if torch.cuda.is_available() else "CPU" + if device != "GPU": + pytest.skip("Skipping healthz endpoint check - running on CPU") client = TestClient(configured_app) response = client.get("/healthz") - # Assume we have a GPU available and the model & pipeline initialized for testing + # Assuming we have a GPU available assert response.status_code == 200 assert response.json() == 
{"status": "Healthy"} diff --git a/presets/inference/text-generation/tests/test_model_config.py b/presets/inference/text-generation/tests/test_model_config.py index d0662ba0e..df5b98e8d 100644 --- a/presets/inference/text-generation/tests/test_model_config.py +++ b/presets/inference/text-generation/tests/test_model_config.py @@ -10,8 +10,8 @@ sys.path.append(parent_dir) @pytest.fixture(params=[ - {"pipeline": "text-generation", "model_path": "microsoft/phi-2", "torch_dtype": "bfloat16"}, - {"pipeline": "conversational", "model_path": "mistralai/Mistral-7B-Instruct-v0.2", "torch_dtype": "bfloat16"}, + {"pipeline": "text-generation", "model_path": "stanford-crfm/alias-gpt2-small-x21"}, + {"pipeline": "conversational", "model_path": "stanford-crfm/alias-gpt2-small-x21"}, ]) def configured_model_config(request): original_argv = sys.argv.copy() @@ -20,7 +20,7 @@ def configured_model_config(request): 'program_name', '--pipeline', request.param['pipeline'], '--pretrained_model_name_or_path', request.param['model_path'], - '--torch_dtype', request.param['torch_dtype'] + '--allow_remote_files', 'True' ] import inference_api @@ -31,7 +31,6 @@ def configured_model_config(request): model_config = ModelConfig( pipeline=request.param['pipeline'], pretrained_model_name_or_path=request.param['model_path'], - torch_dtype=request.param['torch_dtype'] ) yield model_config From 56cacc6a96aa88f8c0339f6304db66863621b0d7 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 13 Feb 2024 21:15:28 -0800 Subject: [PATCH 15/24] fix: need to rebuild models --- presets/models/supported_models.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index d5e4fea1f..f4fa1a87c 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,58 +3,58 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-13b type: llama2-completion runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-70b type: llama2-completion runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.1 + tag: 0.0.2 # Falcon - name: falcon-7b type: text-generation version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - tag: 0.0.1 + tag: 0.0.2 - name: falcon-7b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - tag: 0.0.1 + tag: 0.0.2 - name: falcon-40b type: text-generation version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - tag: 0.0.1 + tag: 0.0.2 - name: falcon-40b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - tag: 0.0.1 + tag: 0.0.2 # Mistral - name: mistral-7b type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24 runtime: tfs - tag: 0.0.1 + tag: 0.0.2 - name: mistral-7b-instruct type: text-generation version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/commit/9ab9e76e2b09f9f29ea2d56aa5bd139e4445c59e runtime: 
tfs - tag: 0.0.1 + tag: 0.0.2 From cfe9ac7e3341938c93fa79210a4216274213edcd Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Wed, 14 Feb 2024 16:01:04 -0800 Subject: [PATCH 16/24] fix: makefile edit for unit tests --- .github/workflows/e2e-preset-test.yml | 2 +- .github/workflows/preset-image-build.yml | 3 ++- .github/workflows/tests.yml | 4 ++++ Makefile | 2 ++ docker/presets/llama-2/Dockerfile | 1 - 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 641ac2ab1..db96dcec2 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -33,7 +33,7 @@ jobs: - name: Determine Affected Models id: affected_models run: | - PR_BRANCH=${{ github.head_ref }} \ + PR_BRANCH=${{ github.ref_name }} \ python3 .github/workflows/kind-cluster/determine_models.py - name: Print Determined Models diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml index 42ef6c7a6..fd86ac741 100644 --- a/.github/workflows/preset-image-build.yml +++ b/.github/workflows/preset-image-build.yml @@ -118,5 +118,6 @@ jobs: if: ${{ always() }} run: | kubectl get job --no-headers -o custom-columns=":metadata.name" | grep "^docker-build-job-${{ matrix.model.name }}-[0-9]" | xargs -r kubectl delete job - kubectl get pods --no-headers -o custom-columns=":metadata.name" | grep "^docker-build-job-${{ matrix.model.name }}-[0-9]" | xargs -r kubectl delete pod + # Job deletion above deletes associated pods + # kubectl get pods --no-headers -o custom-columns=":metadata.name" | grep "^docker-build-job-${{ matrix.model.name }}-[0-9]" | xargs -r kubectl delete pod diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5b04b414d..29d14ac24 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,6 +38,10 @@ jobs: run: | make unit-test + - name: Run inference api unit tests + run: | + make inference-api-e2e + - name: Upload Codecov report uses: codecov/codecov-action@v4 with: diff --git a/Makefile b/Makefile index fd9c5ca54..1a5445c48 100644 --- a/Makefile +++ b/Makefile @@ -82,6 +82,8 @@ fmt: ## Run go fmt against code. unit-test: ## Run unit tests. go test -v $(shell go list ./pkg/... ./api/... | grep -v /vendor) -race -coverprofile=coverage.txt -covermode=atomic go tool cover -func=coverage.txt + +inference-api-e2e: pip install -r presets/inference/text-generation/requirements.txt pytest -o log_cli=true -o log_cli_level=INFO . diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 9b7f3d394..103afb297 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -17,7 +17,6 @@ RUN git clone https://github.com/facebookresearch/llama WORKDIR /workspace/llama -# RUN sed -i $'/torch.distributed.init_process_group("nccl")/c\\ import datetime\\\n torch.distributed.init_process_group("nccl", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py RUN ["/bin/bash", "-c", "sed -i $'/torch.distributed.init_process_group(\"nccl\")/c\\ import datetime\\\n torch.distributed.init_process_group(\"nccl\", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py"] RUN pip install -e . 
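The Makefile change in the patch above splits the Python tests out of `unit-test` into a dedicated `inference-api-e2e` target (install presets/inference/text-generation/requirements.txt, then run pytest), which is what the new "Run inference api unit tests" step in tests.yml invokes with `make inference-api-e2e`.

The bash-wrapped sed retained in the llama-2 Dockerfile rewrites /workspace/llama/llama/generation.py so the NCCL process group is created with an effectively unlimited timeout, presumably so that slow multi-GPU weight loading does not trip the default timeout. As a sketch, the substituted lines are intended to read as follows (the surrounding function and exact indentation come from the upstream facebookresearch/llama sources and are assumed here):

    # produced by the sed rule above, which replaces the original single-line init call
    import datetime
    torch.distributed.init_process_group("nccl", timeout=datetime.timedelta(days=365*100))
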
From 29ddf2e2e6cf328a2b17dbc839b6e4fd55835b11 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Wed, 14 Feb 2024 16:05:28 -0800 Subject: [PATCH 17/24] nit: add quotes --- .github/workflows/e2e-preset-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index db96dcec2..0e65a5591 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -65,7 +65,7 @@ jobs: # COMBINED_MATRIX.append(combined) # break - COMBINED_MATRIX=$(echo ${{ steps.affected_models.outputs.matrix }} | jq --argjson configs "$CONFIGS" -c ' + COMBINED_MATRIX=$(echo '${{ steps.affected_models.outputs.matrix }}' | jq --argjson configs "$CONFIGS" -c ' map(. as $model | $configs[] | select(.name == $model.name) | $model + .) ') From f05f56582854323c87a0d3122a33deda386ef5d1 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Wed, 14 Feb 2024 17:56:17 -0800 Subject: [PATCH 18/24] fix: pin torch version and increase e2e preset timeout --- .github/workflows/e2e-preset-test.yml | 2 +- docker/presets/llama-2/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 0e65a5591..3003e352f 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -252,7 +252,7 @@ jobs: - name: Wait for Resource to be ready if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' run: | - kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} + kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} --timeout=1800s - name: Test home endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile index 103afb297..8957ef749 100644 --- a/docker/presets/llama-2/Dockerfile +++ b/docker/presets/llama-2/Dockerfile @@ -20,7 +20,7 @@ WORKDIR /workspace/llama RUN ["/bin/bash", "-c", "sed -i $'/torch.distributed.init_process_group(\"nccl\")/c\\ import datetime\\\n torch.distributed.init_process_group(\"nccl\", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py"] RUN pip install -e . 
-RUN pip install torch==2.2.0 fastapi==0.103.2 pydantic==1.10.9 gputil==1.4.0 +RUN pip install torch==2.1.0 fastapi==0.103.2 pydantic==1.10.9 gputil==1.4.0 RUN pip install 'uvicorn[standard]' ARG WEIGHTS_PATH From 53eccf19d864d5d683c0ef7a22c42e675d3e0f9b Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Sun, 18 Feb 2024 23:09:45 -0800 Subject: [PATCH 19/24] feat: update requirements --- docker/presets/tfs/Dockerfile | 2 +- .../text-generation/requirements.txt | 2 +- presets/models/supported_models.yaml | 21 ++++++++++++------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/docker/presets/tfs/Dockerfile b/docker/presets/tfs/Dockerfile index 9ddcf9b93..5a322b8bd 100644 --- a/docker/presets/tfs/Dockerfile +++ b/docker/presets/tfs/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.8-slim +FROM python:3.10-slim ARG WEIGHTS_PATH ARG MODEL_TYPE diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 6cfc08e21..8a7c50dbe 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -1,5 +1,5 @@ # Dependencies for TFS -transformers==4.36.0 +transformers==4.37.2 torch==2.2.0 accelerate==0.23.0 fastapi==0.109.1 diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml index f4fa1a87c..0ce52b1e7 100644 --- a/presets/models/supported_models.yaml +++ b/presets/models/supported_models.yaml @@ -3,27 +3,27 @@ models: - name: llama-2-7b type: llama2-completion runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-7b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-13b type: llama2-completion runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-13b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-70b type: llama2-completion runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 - name: llama-2-70b-chat type: llama2-chat runtime: llama-2 - tag: 0.0.2 + tag: 0.0.3 # Falcon - name: falcon-7b @@ -55,6 +55,13 @@ models: tag: 0.0.2 - name: mistral-7b-instruct type: text-generation - version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/commit/9ab9e76e2b09f9f29ea2d56aa5bd139e4445c59e + version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61 runtime: tfs tag: 0.0.2 + + # Phi-2 + - name: phi-2 + type: text-generation + version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 + runtime: tfs + tag: 0.0.1 \ No newline at end of file From f6ec3d3cab2456bcacc0abee6070deebffb48a01 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 19 Feb 2024 14:36:30 -0800 Subject: [PATCH 20/24] fix: include rename in different PR --- pkg/inference/preset-inferences.go | 2 +- pkg/inference/preset-inferences_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/inference/preset-inferences.go b/pkg/inference/preset-inferences.go index 9b02012b7..7d49bfece 100644 --- a/pkg/inference/preset-inferences.go +++ b/pkg/inference/preset-inferences.go @@ -21,7 +21,7 @@ import ( const ( ProbePath = "/healthz" Port5000 = int32(5000) - InferenceFile = "inference_api.py" + InferenceFile = "inference-api.py" DefaultVolumeMountPath = "/dev/shm" ) diff --git a/pkg/inference/preset-inferences_test.go b/pkg/inference/preset-inferences_test.go index cd8df067c..bb73b7894 100644 --- a/pkg/inference/preset-inferences_test.go +++ b/pkg/inference/preset-inferences_test.go @@ -37,7 +37,7 @@ 
func TestCreatePresetInference(t *testing.T) { workload: "Deployment", // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams // So expected cmd consists of shell command and inference file - expectedCmd: "/bin/sh -c inference_api.py", + expectedCmd: "/bin/sh -c inference-api.py", }, "test-distributed-model": { @@ -48,7 +48,7 @@ func TestCreatePresetInference(t *testing.T) { c.On("Create", mock.IsType(context.Background()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) }, workload: "StatefulSet", - expectedCmd: "/bin/sh -c inference_api.py", + expectedCmd: "/bin/sh -c inference-api.py", }, } From 8133e020208f8ec8ea86b79301029a6614b0734a Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 20 Feb 2024 15:54:14 -0800 Subject: [PATCH 21/24] nit: gpu change --- presets/test/manifests/phi-2/phi-2.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presets/test/manifests/phi-2/phi-2.yaml b/presets/test/manifests/phi-2/phi-2.yaml index 9882b80cb..15a7fbde2 100644 --- a/presets/test/manifests/phi-2/phi-2.yaml +++ b/presets/test/manifests/phi-2/phi-2.yaml @@ -21,9 +21,9 @@ spec: - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 limits: - nvidia.com/gpu: 2 # Requesting 2 GPUs + nvidia.com/gpu: 1 # Requesting 1 GPUs livenessProbe: httpGet: path: /healthz From 852776240e1936fafd60b11f8037838eb1071cc3 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 20 Feb 2024 16:17:30 -0800 Subject: [PATCH 22/24] fix: phi-2 needs more memory --- .github/e2e-preset-configs.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json index a499d28d6..e3cbbb8b9 100644 --- a/.github/e2e-preset-configs.json +++ b/.github/e2e-preset-configs.json @@ -41,7 +41,7 @@ "name": "phi-2", "node-count": 1, "node-vm-size": "Standard_NC6s_v3", - "node-osdisk-size": 30 + "node-osdisk-size": 50 }, { "name": "llama-2-7b", From 0e428fd91bd12a731ba24fe6a7d87178e7e5026a Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 20 Feb 2024 22:41:44 -0800 Subject: [PATCH 23/24] fix: increase timeout --- .../test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml | 1 + presets/test/manifests/falcon-40b/falcon-40b.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml index 04c49b6ce..8dd56c6a6 100644 --- a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml +++ b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml @@ -12,6 +12,7 @@ spec: labels: app: falcon spec: + progressDeadlineSeconds: 1800 containers: - name: falcon-container image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE diff --git a/presets/test/manifests/falcon-40b/falcon-40b.yaml b/presets/test/manifests/falcon-40b/falcon-40b.yaml index 96610a9fb..5ca9716a0 100644 --- a/presets/test/manifests/falcon-40b/falcon-40b.yaml +++ b/presets/test/manifests/falcon-40b/falcon-40b.yaml @@ -12,6 +12,7 @@ spec: labels: app: falcon spec: + progressDeadlineSeconds: 1800 containers: - name: falcon-container image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE From e1a6e41b4b20c038f29f0742218ac9318e58b827 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 20 Feb 2024 22:56:46 -0800 Subject: [PATCH 24/24] fix: 
increase timeout --- .../test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml | 2 +- presets/test/manifests/falcon-40b/falcon-40b.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml index 8dd56c6a6..226a485a7 100644 --- a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml +++ b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml @@ -3,6 +3,7 @@ kind: Deployment metadata: name: falcon-40b-instruct spec: + progressDeadlineSeconds: 1800 replicas: 1 selector: matchLabels: @@ -12,7 +13,6 @@ spec: labels: app: falcon spec: - progressDeadlineSeconds: 1800 containers: - name: falcon-container image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE diff --git a/presets/test/manifests/falcon-40b/falcon-40b.yaml b/presets/test/manifests/falcon-40b/falcon-40b.yaml index 5ca9716a0..a4cb2d524 100644 --- a/presets/test/manifests/falcon-40b/falcon-40b.yaml +++ b/presets/test/manifests/falcon-40b/falcon-40b.yaml @@ -3,6 +3,7 @@ kind: Deployment metadata: name: falcon-40b spec: + progressDeadlineSeconds: 1800 replicas: 1 selector: matchLabels: @@ -12,7 +13,6 @@ spec: labels: app: falcon spec: - progressDeadlineSeconds: 1800 containers: - name: falcon-container image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE
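
Taken together, the last two patches introduce `progressDeadlineSeconds: 1800` for the falcon-40b and falcon-40b-instruct manifests and then move it from the pod template's spec up to the Deployment spec, where Kubernetes actually honors it (the field belongs to DeploymentSpec, not PodSpec). A minimal sketch of the intended final shape, with fields taken from the diffs above and container details truncated:

    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: falcon-40b
    spec:
      progressDeadlineSeconds: 1800   # rollout deadline, honored at the Deployment level
      replicas: 1
      selector:
        matchLabels:
          app: falcon
      template:
        metadata:
          labels:
            app: falcon
        spec:                          # pod spec; progressDeadlineSeconds is not valid here
          containers:
          - name: falcon-container
            image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE

The 1800-second deadline matches the `--timeout=1800s` passed to `kubectl rollout status` in the e2e preset workflow, so both the Deployment controller and the test wait up to 30 minutes for the large falcon-40b images to pull and start.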