From 9b6b8e9f163b51534e2c0d8211f42e48300014b2 Mon Sep 17 00:00:00 2001
From: Adam Bouhenguel
Date: Mon, 4 Dec 2023 20:13:59 -0500
Subject: [PATCH] llamafile 0.2.1, model images use server

---
 Dockerfile         | 20 ++++++++++----------
 README.md          | 10 ++++++----
 docker-compose.yml | 22 ++++++++++++++++++++++
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8b35ed6..bbe7d43 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -108,14 +108,17 @@ ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\"", "sh", "/usr/bin/llamafile"]
 
 FROM cosmos-scratch as llamafile-gguf
 LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos
-ADD --checksum=sha256:dc538ce8721bb84ad3a9f683757ce7a227e61bf2c6e092c4014838fe198c41cc --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.1/llamafile-main-0.1 /usr/bin/llamafile-main
+ADD --checksum=sha256:c7151d48677e352e492731bd999d9d74c792fa1440715a858dbf3b92ee274abe --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-main-0.2.1 /usr/bin/llamafile-main
+ADD --checksum=sha256:2b3c692e50d903cbf6ac3d8908f8394101b5be5f8a4573b472975fa8c9f09e68 --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-server-0.2.1 /usr/bin/llamafile-server
 ARG GGUF_URL
 ARG GGUF_CHECKSUM
 ADD --checksum=${GGUF_CHECKSUM} --chmod=0755 ${GGUF_URL} /model.gguf
-ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\"", "sh", "/usr/bin/llamafile-main", "-m", "/model.gguf"]
+EXPOSE 8080
+ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\"", "sh", "/usr/bin/llamafile-server", "-m", "/model.gguf", "--port", "8080", "--host", "0.0.0.0", "--nobrowser"]
 
 FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 as devel-llamafile
-ADD --checksum=sha256:dc538ce8721bb84ad3a9f683757ce7a227e61bf2c6e092c4014838fe198c41cc --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.1/llamafile-main-0.1 /usr/bin/llamafile-main
+ADD --checksum=sha256:c7151d48677e352e492731bd999d9d74c792fa1440715a858dbf3b92ee274abe --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-main-0.2.1 /usr/bin/llamafile-main
+ADD --checksum=sha256:2b3c692e50d903cbf6ac3d8908f8394101b5be5f8a4573b472975fa8c9f09e68 --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-server-0.2.1 /usr/bin/llamafile-server
 # HACK we need to assimilate so this can run on github actions...
 COPY --from=unpack-cosmos /usr/bin/assimilate /usr/bin/
 RUN /usr/bin/assimilate -c /usr/bin/llamafile-main
@@ -134,11 +137,6 @@ COPY --from=devel-llamafile /root/.llamafile /root/.llamafile
 ENV PATH=/bin:/usr/bin
 ENV HOME=/root
 ENV LD_LIBRARY_PATH=/usr/local/cuda/targets/x86_64-linux/lib:/lib:/lib64
-# HACK forge an executable nvcc, because llamafile needs to find nvcc before looking for cached .cosmo and .llamafile files
-COPY --from=unpack-cosmos /bin/chmod /bin/
-WORKDIR /usr/local/cuda/bin/
-RUN printf "" >nvcc
-RUN chmod 0755 nvcc
 # HACK things seem to fail if we have multiple CUDA devices. limit ourselves to one device for now to avoid errors like:
 # > CUDA error 2 at /root/.llamafile/ggml-cuda.cu:7864: out of memory
 # > current device: 4
@@ -155,10 +153,12 @@ ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\" --n-gpu-layers $LLAMAFILE_N_GPU_LAYERS
 
 FROM llamafile-cuda-scratch as llamafile-gguf-cuda
 LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos
-ADD --checksum=sha256:dc538ce8721bb84ad3a9f683757ce7a227e61bf2c6e092c4014838fe198c41cc --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.1/llamafile-main-0.1 /usr/bin/llamafile-main
+ADD --checksum=sha256:c7151d48677e352e492731bd999d9d74c792fa1440715a858dbf3b92ee274abe --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-main-0.2.1 /usr/bin/llamafile-main
+ADD --checksum=sha256:2b3c692e50d903cbf6ac3d8908f8394101b5be5f8a4573b472975fa8c9f09e68 --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-server-0.2.1 /usr/bin/llamafile-server
 ARG GGUF_URL
 ARG GGUF_CHECKSUM
 ADD --checksum=${GGUF_CHECKSUM} --chmod=0755 ${GGUF_URL} /model.gguf
 ARG LLAMAFILE_N_GPU_LAYERS=35
 ENV LLAMAFILE_N_GPU_LAYERS=${LLAMAFILE_N_GPU_LAYERS}
-ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\" --n-gpu-layers $LLAMAFILE_N_GPU_LAYERS", "sh", "/usr/bin/llamafile-main", "-m", "/model.gguf"]
+EXPOSE 8080
+ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\" --n-gpu-layers $LLAMAFILE_N_GPU_LAYERS", "sh", "/usr/bin/llamafile-server", "-m", "/model.gguf", "--port", "8080", "--host", "0.0.0.0", "--nobrowser"]
diff --git a/README.md b/README.md
index 6eda8b6..93d5a8a 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,11 @@ docker compose run --build --rm -it python
 docker compose run --build --rm -it lua
 docker compose run --build --rm -it sqlite3
 docker compose run --build --rm -it qjs
-docker compose run --build --rm -it mistral-7b-instruct-v0.1-q4_k_m-cuda
-docker compose run --build --rm -it mistral-7b-instruct-v0.1-q4_k_m
-docker compose run --build --rm -it llava-v1.5-7b-q4_k-cuda
-docker compose run --build --rm -it llava-v1.5-7b-q4_k
+docker compose run --build --service-ports --rm -it mistral-7b-instruct-v0.1-q4_k_m-cuda
+docker compose run --build --service-ports --rm -it mistral-7b-instruct-v0.1-q4_k_m
+docker compose run --build --service-ports --rm -it llava-v1.5-7b-q4_k-cuda
+docker compose run --build --service-ports --rm -it llava-v1.5-7b-q4_k
+docker compose run --build --service-ports --rm -it airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cuda
+docker compose run --build --service-ports --rm -it airoboros-m-7b-3.1.2-dare-0.85.q4_k_m
 ```
 
diff --git a/docker-compose.yml b/docker-compose.yml
index 9164113..c4856a0 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -54,6 +54,7 @@ services:
   llava-v1.5-7b-q4_k-cuda:
     image: ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cuda-12.1.1-cosmo-3.1.1
     deploy: {resources: {reservations: {devices: [{driver: nvidia, count: all, capabilities: ["gpu"]}]}}}
+    ports: ["8080:8080"]
     build:
       dockerfile: Dockerfile
       target: llamafile-gguf-cuda
@@ -63,9 +64,30 @@
         LLAMAFILE_N_GPU_LAYERS: 35
   llava-v1.5-7b-q4_k:
     image: ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cosmo-3.1.1
+    ports: ["8080:8080"]
     build:
       dockerfile: Dockerfile
       target: llamafile-gguf
       args:
         GGUF_URL: https://huggingface.co/jartine/llava-v1.5-7B-GGUF/resolve/main/llava-v1.5-7b-Q4_K.gguf?download=true
         GGUF_CHECKSUM: sha256:c91ebf0a628ceb25e374df23ad966cc1bf1514b33fecf4f0073f9619dec5b3f9
+  airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cuda:
+    image: ghcr.io/ajbouh/cosmos:airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cuda-12.1.1-cosmo-3.1.1
+    deploy: {resources: {reservations: {devices: [{driver: nvidia, count: all, capabilities: ["gpu"]}]}}}
+    ports: ["8080:8080"]
+    build:
+      dockerfile: Dockerfile
+      target: llamafile-gguf-cuda
+      args:
+        GGUF_URL: https://huggingface.co/TheBloke/airoboros-m-7B-3.1.2-dare-0.85-GGUF/resolve/main/airoboros-m-7b-3.1.2-dare-0.85.Q4_K_M.gguf?download=true
+        GGUF_CHECKSUM: sha256:5d6bc74b99aa89d3c35c90c74d6844e1e45bd810dd08f9f55252f74ed87b0663
+        LLAMAFILE_N_GPU_LAYERS: 35
+  airoboros-m-7b-3.1.2-dare-0.85.q4_k_m:
+    image: ghcr.io/ajbouh/cosmos:airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cosmo-3.1.1
+    ports: ["8080:8080"]
+    build:
+      dockerfile: Dockerfile
+      target: llamafile-gguf
+      args:
+        GGUF_URL: https://huggingface.co/TheBloke/airoboros-m-7B-3.1.2-dare-0.85-GGUF/resolve/main/airoboros-m-7b-3.1.2-dare-0.85.Q4_K_M.gguf?download=true
+        GGUF_CHECKSUM: sha256:5d6bc74b99aa89d3c35c90c74d6844e1e45bd810dd08f9f55252f74ed87b0663
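
With this change the GGUF model images run llamafile-server instead of llamafile-main, listening on port 8080 inside the container and published to the host via the new ports: ["8080:8080"] mappings and the --service-ports flag in the README commands. A minimal smoke test of a running service might look like the sketch below; it assumes the default published port and the llama.cpp-style /completion endpoint that llamafile-server exposes, and the prompt and n_predict values are arbitrary placeholders.

```sh
# Start one of the server-backed model images (CPU variant shown);
# --service-ports publishes the container's 8080 to the host.
docker compose run --build --service-ports --rm -it mistral-7b-instruct-v0.1-q4_k_m

# From another terminal, request a completion from the running server.
curl -s http://localhost:8080/completion \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'
```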