From d61d43e4e4f20268af415d71b8e946fe0112a4d1 Mon Sep 17 00:00:00 2001
From: Wei-Lin Chiang
Date: Sat, 21 Oct 2023 14:06:08 -0700
Subject: [PATCH] Update README.md (vicuna-v1.3 -> vicuna-1.5) (#2592)

---
 README.md                          | 24 ++++++++++++------------
 docker/docker-compose.yml          |  2 +-
 docs/langchain_integration.md      |  2 +-
 docs/model_support.md              |  4 ++--
 docs/openai_api.md                 | 14 +++++++-------
 docs/vllm_integration.md           |  4 ++--
 fastchat/llm_judge/README.md       |  2 +-
 fastchat/model/model_adapter.py    |  4 ++--
 fastchat/serve/cli.py              |  2 +-
 fastchat/serve/huggingface_api.py  |  2 +-
 fastchat/serve/launch_all_serve.py |  2 +-
 fastchat/serve/vllm_worker.py      |  2 +-
 scripts/train_lora.sh              |  2 +-
 tests/test_cli.py                  |  2 +-
 tests/test_openai_langchain.py     |  2 +-
 15 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index 6feb2d9bd..9d72de702 100644
--- a/README.md
+++ b/README.md
@@ -110,13 +110,13 @@ The command below requires around 14GB of GPU memory for Vicuna-7B and 28GB of G
 See the ["Not Enough Memory" section](#not-enough-memory) below if you do not have enough memory.
 `--model-path` can be a local folder or a Hugging Face repo name.
 ```
-python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5
 ```

 #### Multiple GPUs
 You can use model parallelism to aggregate GPU memory from multiple GPUs on the same machine.
 ```
-python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --num-gpus 2
+python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5 --num-gpus 2
 ```

 Tips:
@@ -125,25 +125,25 @@ You can use `--max-gpu-memory` to specify the maximum memory per GPU for storing
 This allows it to allocate more memory for activations, so you can use longer context lengths or larger batch sizes. For example,

 ```
-python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --num-gpus 2 --max-gpu-memory 8GiB
+python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5 --num-gpus 2 --max-gpu-memory 8GiB
 ```

 #### CPU Only
 This runs on the CPU only and does not require GPU. It requires around 30GB of CPU memory for Vicuna-7B and around 60GB of CPU memory for Vicuna-13B.
 ```
-python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device cpu
+python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5 --device cpu
 ```

 Use Intel AI Accelerator AVX512_BF16/AMX to accelerate CPU inference.
 ```
-CPU_ISA=amx python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device cpu
+CPU_ISA=amx python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5 --device cpu
 ```

 #### Metal Backend (Mac Computers with Apple Silicon or AMD GPUs)
 Use `--device mps` to enable GPU acceleration on Mac computers (requires torch >= 2.0).
 Use `--load-8bit` to turn on 8-bit compression.
 ```
-python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device mps --load-8bit
+python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5 --device mps --load-8bit
 ```
 Vicuna-7B can run on a 32GB M1 Macbook with 1 - 2 words / second.
@@ -155,7 +155,7 @@ source /opt/intel/oneapi/setvars.sh

 Use `--device xpu` to enable XPU/GPU acceleration.
 ```
-python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device xpu
+python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5 --device xpu
 ```
 Vicuna-7B can run on an Intel Arc A770 16GB.
@@ -167,7 +167,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh

 Use `--device npu` to enable NPU acceleration.
 ```
-python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device npu
+python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5 --device npu
 ```
 Vicuna-7B/13B can run on an Ascend 910B NPU 60GB.
@@ -179,7 +179,7 @@ It is compatible with the CPU, GPU, and Metal backend.
 Vicuna-13B with 8-bit compression can run on a single GPU with 16 GB of VRAM, like an Nvidia RTX 3090, RTX 4080, T4, V100 (16GB), or an AMD RX 6800 XT.

 ```
-python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --load-8bit
+python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5 --load-8bit
 ```

 In addition to that, you can add `--cpu-offloading` to commands above to offload weights that don't fit on your GPU onto the CPU memory.
@@ -209,13 +209,13 @@ This controller manages the distributed workers.

 #### Launch the model worker(s)
 ```bash
-python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.5
 ```
 Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller .

 To ensure that your model worker is connected to your controller properly, send a test message using the following command:
 ```bash
-python3 -m fastchat.serve.test_message --model-name vicuna-7b-v1.3
+python3 -m fastchat.serve.test_message --model-name vicuna-7b-v1.5
 ```
 You will see a short output.
@@ -233,7 +233,7 @@ If the models do not show up, try to reboot the gradio web server.
 - You can register multiple model workers to a single controller, which can be used for serving a single model with higher throughput or serving multiple models at the same time. When doing so, please allocate different GPUs and ports for different model workers.
 ```
 # worker 0
-CUDA_VISIBLE_DEVICES=0 python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.3 --controller http://localhost:21001 --port 31000 --worker http://localhost:31000
+CUDA_VISIBLE_DEVICES=0 python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.5 --controller http://localhost:21001 --port 31000 --worker http://localhost:31000
 # worker 1
 CUDA_VISIBLE_DEVICES=1 python3 -m fastchat.serve.model_worker --model-path lmsys/fastchat-t5-3b-v1.0 --controller http://localhost:21001 --port 31001 --worker http://localhost:31001
 ```
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 64a7f60fd..113e0c7a3 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -23,7 +23,7 @@ services:
             - driver: nvidia
               count: 1
               capabilities: [gpu]
-    entrypoint: ["python3.9", "-m", "fastchat.serve.model_worker", "--model-names", "${FASTCHAT_WORKER_MODEL_NAMES:-vicuna-7b-v1.3}", "--model-path", "${FASTCHAT_WORKER_MODEL_PATH:-lmsys/vicuna-7b-v1.3}", "--worker-address", "http://fastchat-model-worker:21002", "--controller-address", "http://fastchat-controller:21001", "--host", "0.0.0.0", "--port", "21002"]
+    entrypoint: ["python3.9", "-m", "fastchat.serve.model_worker", "--model-names", "${FASTCHAT_WORKER_MODEL_NAMES:-vicuna-7b-v1.5}", "--model-path", "${FASTCHAT_WORKER_MODEL_PATH:-lmsys/vicuna-7b-v1.5}", "--worker-address", "http://fastchat-model-worker:21002", "--controller-address", "http://fastchat-controller:21001", "--host", "0.0.0.0", "--port", "21002"]
   fastchat-api-server:
     build:
       context: .
diff --git a/docs/langchain_integration.md b/docs/langchain_integration.md
index a59d739ab..50174a85e 100644
--- a/docs/langchain_integration.md
+++ b/docs/langchain_integration.md
@@ -19,7 +19,7 @@ Here, we use Vicuna as an example and use it for three endpoints: chat completio
 See a full list of supported models [here](../README.md#supported-models).

 ```bash
-python3 -m fastchat.serve.model_worker --model-names "gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002" --model-path lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.model_worker --model-names "gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002" --model-path lmsys/vicuna-7b-v1.5
 ```

 Finally, launch the RESTful API server
diff --git a/docs/model_support.md b/docs/model_support.md
index 745c6b646..24f3bc9cc 100644
--- a/docs/model_support.md
+++ b/docs/model_support.md
@@ -5,7 +5,7 @@
 - [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
   - example: `python3 -m fastchat.serve.cli --model-path meta-llama/Llama-2-7b-chat-hf`
 - Vicuna, Alpaca, LLaMA, Koala
-  - example: `python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3`
+  - example: `python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5`
 - [BAAI/AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
 - [BAAI/bge-large-en](https://huggingface.co/BAAI/bge-large-en#using-huggingface-transformers)
 - [baichuan-inc/baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
@@ -67,7 +67,7 @@ python3 -m fastchat.serve.cli --model [YOUR_MODEL_PATH]
 You can run this example command to learn the code logic.

 ```
-python3 -m fastchat.serve.cli --model lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.cli --model lmsys/vicuna-7b-v1.5
 ```

 You can add `--debug` to see the actual prompt sent to the model.
diff --git a/docs/openai_api.md b/docs/openai_api.md
index 0c555a60e..f3c0fba93 100644
--- a/docs/openai_api.md
+++ b/docs/openai_api.md
@@ -18,7 +18,7 @@ python3 -m fastchat.serve.controller

 Then, launch the model worker(s)
 ```bash
-python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.5
 ```

 Finally, launch the RESTful API server
@@ -45,7 +45,7 @@ import openai
 openai.api_key = "EMPTY"
 openai.api_base = "http://localhost:8000/v1"

-model = "vicuna-7b-v1.3"
+model = "vicuna-7b-v1.5"
 prompt = "Once upon a time"

 # create a completion
@@ -77,7 +77,7 @@ Chat Completions:
 curl http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "vicuna-7b-v1.3",
+    "model": "vicuna-7b-v1.5",
     "messages": [{"role": "user", "content": "Hello! What is your name?"}]
   }'
 ```
@@ -87,7 +87,7 @@ Text Completions:
 curl http://localhost:8000/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "vicuna-7b-v1.3",
+    "model": "vicuna-7b-v1.5",
     "prompt": "Once upon a time",
     "max_tokens": 41,
     "temperature": 0.5
@@ -99,7 +99,7 @@ Embeddings:
 curl http://localhost:8000/v1/embeddings \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "vicuna-7b-v1.3",
+    "model": "vicuna-7b-v1.5",
     "input": "Hello world!"
   }'
 ```
@@ -111,8 +111,8 @@ you can replace the `model_worker` step above with a multi model variant:
 ```bash
 python3 -m fastchat.serve.multi_model_worker \
-    --model-path lmsys/vicuna-7b-v1.3 \
-    --model-names vicuna-7b-v1.3 \
+    --model-path lmsys/vicuna-7b-v1.5 \
+    --model-names vicuna-7b-v1.5 \
     --model-path lmsys/longchat-7b-16k \
     --model-names longchat-7b-16k
 ```
diff --git a/docs/vllm_integration.md b/docs/vllm_integration.md
index e371ef32f..7d3205bb8 100644
--- a/docs/vllm_integration.md
+++ b/docs/vllm_integration.md
@@ -11,12 +11,12 @@ See the supported models [here](https://vllm.readthedocs.io/en/latest/models/sup

 2. When you launch a model worker, replace the normal worker (`fastchat.serve.model_worker`) with the vLLM worker (`fastchat.serve.vllm_worker`). All other commands such as controller, gradio web server, and OpenAI API server are kept the same.
    ```
-   python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.3
+   python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.5
    ```

    If you see tokenizer errors, try
    ```
-   python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.3 --tokenizer hf-internal-testing/llama-tokenizer
+   python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.5 --tokenizer hf-internal-testing/llama-tokenizer
    ```

    If you use an AWQ quantized model, try
diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md
index f1755e3e5..1d2646b13 100644
--- a/fastchat/llm_judge/README.md
+++ b/fastchat/llm_judge/README.md
@@ -49,7 +49,7 @@ Arguments:

 e.g.,
 ```
-python gen_model_answer.py --model-path lmsys/vicuna-7b-v1.3 --model-id vicuna-7b-v1.3
+python gen_model_answer.py --model-path lmsys/vicuna-7b-v1.5 --model-id vicuna-7b-v1.5
 ```
 The answers will be saved to `data/mt_bench/model_answer/[MODEL-ID].jsonl`.
diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index d1759df51..7c1ed844f 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -384,7 +384,7 @@ def add_model_args(parser):
     parser.add_argument(
         "--model-path",
         type=str,
-        default="lmsys/vicuna-7b-v1.3",
+        default="lmsys/vicuna-7b-v1.5",
         help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
     )
     parser.add_argument(
@@ -572,7 +572,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation:


 class VicunaAdapter(BaseModelAdapter):
-    "Model adapter for Vicuna models (e.g., lmsys/vicuna-7b-v1.3)" ""
+    "Model adapter for Vicuna models (e.g., lmsys/vicuna-7b-v1.5)" ""

     use_fast_tokenizer = False
diff --git a/fastchat/serve/cli.py b/fastchat/serve/cli.py
index dbaf9bee9..eba4d0043 100644
--- a/fastchat/serve/cli.py
+++ b/fastchat/serve/cli.py
@@ -2,7 +2,7 @@
 Chat with a model with command line interface.

 Usage:
-python3 -m fastchat.serve.cli --model lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.cli --model lmsys/vicuna-7b-v1.5
 python3 -m fastchat.serve.cli --model lmsys/fastchat-t5-3b-v1.0

 Other commands:
diff --git a/fastchat/serve/huggingface_api.py b/fastchat/serve/huggingface_api.py
index 5a4c30fec..2a49bf5f1 100644
--- a/fastchat/serve/huggingface_api.py
+++ b/fastchat/serve/huggingface_api.py
@@ -2,7 +2,7 @@
 Use FastChat with Hugging Face generation APIs.
 Usage:
-python3 -m fastchat.serve.huggingface_api --model lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.huggingface_api --model lmsys/vicuna-7b-v1.5
 python3 -m fastchat.serve.huggingface_api --model lmsys/fastchat-t5-3b-v1.0
 """
 import argparse
diff --git a/fastchat/serve/launch_all_serve.py b/fastchat/serve/launch_all_serve.py
index 1952cfb17..2f4ad7b0b 100644
--- a/fastchat/serve/launch_all_serve.py
+++ b/fastchat/serve/launch_all_serve.py
@@ -54,7 +54,7 @@
 parser.add_argument(
     "--model-path",
     type=str,
-    default="lmsys/vicuna-7b-v1.3",
+    default="lmsys/vicuna-7b-v1.5",
     help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
 )
 parser.add_argument(
diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index eb0bfe26a..be247afa1 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -205,7 +205,7 @@ async def api_model_details(request: Request):
     parser.add_argument(
         "--controller-address", type=str, default="http://localhost:21001"
     )
-    parser.add_argument("--model-path", type=str, default="lmsys/vicuna-7b-v1.3")
+    parser.add_argument("--model-path", type=str, default="lmsys/vicuna-7b-v1.5")
     parser.add_argument(
         "--model-names",
         type=lambda s: s.split(","),
diff --git a/scripts/train_lora.sh b/scripts/train_lora.sh
index 62648f40d..d30caad41 100644
--- a/scripts/train_lora.sh
+++ b/scripts/train_lora.sh
@@ -1,5 +1,5 @@
 deepspeed fastchat/train/train_lora.py \
-    --model_name_or_path lmsys/vicuna-7b-v1.3 \
+    --model_name_or_path lmsys/vicuna-7b-v1.5 \
     --lora_r 8 \
     --lora_alpha 16 \
     --lora_dropout 0.05 \
diff --git a/tests/test_cli.py b/tests/test_cli.py
index dcefa4bbe..113e497a4 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -69,7 +69,7 @@ def test_8bit():

 def test_hf_api():
     models = [
-        "lmsys/vicuna-7b-v1.3",
+        "lmsys/vicuna-7b-v1.5",
         "lmsys/fastchat-t5-3b-v1.0",
     ]
diff --git a/tests/test_openai_langchain.py b/tests/test_openai_langchain.py
index 3efa50322..b9c07fcf6 100644
--- a/tests/test_openai_langchain.py
+++ b/tests/test_openai_langchain.py
@@ -1,5 +1,5 @@
 # Usage:
-# python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.3 --model-names gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002
+# python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.5 --model-names gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002
 # export OPENAI_API_BASE=http://localhost:8000/v1
 # export OPENAI_API_KEY=EMPTY
 # wget https://raw.githubusercontent.com/hwchase17/langchain/v0.0.200/docs/modules/state_of_the_union.txt
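
A quick way to exercise the rename end to end is to query the OpenAI-compatible endpoints with the new model name, following the usage shown in `docs/openai_api.md` above. The snippet below is a minimal sketch, not part of the patch: it assumes the controller, a `vicuna-7b-v1.5` model worker, and the RESTful API server from those docs are already running on `localhost:8000`, and it uses the `openai` 0.x client exactly as the docs do; the prompts and `max_tokens` value are illustrative.

```python
# Minimal sketch: assumes the FastChat controller, a vicuna-7b-v1.5 model worker,
# and the OpenAI-compatible REST server from docs/openai_api.md run on localhost:8000.
import openai

openai.api_key = "EMPTY"  # the docs use "EMPTY"; the key is not checked by default
openai.api_base = "http://localhost:8000/v1"

model = "vicuna-7b-v1.5"  # must match the name the worker registered

# Text completion against the renamed model.
completion = openai.Completion.create(
    model=model, prompt="Once upon a time", max_tokens=64
)
print(completion.choices[0].text)

# Chat completion against the renamed model.
chat = openai.ChatCompletion.create(
    model=model,
    messages=[{"role": "user", "content": "Hello! What is your name?"}],
)
print(chat.choices[0].message.content)
```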