Merge pull request #2344 from jim60105/dev
chore(docker): Decouple Tensorboard from the project and add CI build workflow.
bmaltais authored Apr 25, 2024
2 parents d7e39c3 + 0138791 commit 059afdb
Showing 5 changed files with 173 additions and 29 deletions.
91 changes: 91 additions & 0 deletions .github/workflows/docker_publish.yml
@@ -0,0 +1,91 @@
# Check this guide for more information about publishing to ghcr.io with GitHub Actions:
# https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions#upgrading-a-workflow-that-accesses-ghcrio

# Build the Docker image and push it to the registry
name: docker_publish

on:
# Trigger the workflow on pushes of tags matching the pattern v*, for example v1.0.0
push:
tags:
- "v*"

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

jobs:
# Only run this job on tags
docker-tag:
runs-on: ubuntu-latest
if: startsWith(github.ref, 'refs/tags/')

# Sets the permissions granted to the GITHUB_TOKEN for the actions in this job.
permissions:
contents: read
packages: write

steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true

# We require additional disk space due to the large size of our image (~10GB).
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
tool-cache: true
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true

- name: Docker meta:${{ github.ref_name }}
id: meta
uses: docker/metadata-action@v5
with:
images: ghcr.io/${{ github.repository_owner }}/kohya-ss-gui
flavor: |
latest=auto
prefix=
suffix=
# https://github.com/docker/metadata-action/tree/v5/?tab=readme-ov-file#tags-input
tags: |
type=semver,pattern=v{{major}}
type=semver,pattern={{raw}}
- name: Set up QEMU
uses: docker/setup-qemu-action@v3

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

# You may need to manage read and write access for GitHub Actions in the repository's package (container registry) settings.
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Build and push
uses: docker/build-push-action@v5
id: publish
with:
context: .
file: ./Dockerfile
push: true
target: final
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
build-args: |
VERSION=${{ github.ref_name }}
RELEASE=${{ github.run_number }}
platforms: linux/amd64
# Cache to registry instead of gha to avoid the capacity limit.
cache-from: type=registry,ref=ghcr.io/${{ github.repository_owner }}/kohya-ss-gui:cache
cache-to: type=registry,ref=ghcr.io/${{ github.repository_owner }}/kohya-ss-gui:cache,mode=max
sbom: true
provenance: true
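The workflow above publishes on every `v*` tag push. A minimal sketch of how the two `semver` tag patterns expand for a hypothetical release tag (`OWNER` is a placeholder for `${{ github.repository_owner }}`):

```shell
# Hypothetical release: pushing an annotated tag such as v1.2.3
# triggers the docker_publish workflow above:
#   git tag -a v1.2.3 -m "release v1.2.3" && git push origin v1.2.3
#
# docker/metadata-action then expands the two semver patterns:
#   type=semver,pattern=v{{major}} -> v1
#   type=semver,pattern={{raw}}    -> v1.2.3
TAG="v1.2.3"
MAJOR="${TAG%%.*}"                         # drop everything after the first dot
echo "ghcr.io/OWNER/kohya-ss-gui:${MAJOR}" # -> ghcr.io/OWNER/kohya-ss-gui:v1
echo "ghcr.io/OWNER/kohya-ss-gui:${TAG}"   # -> ghcr.io/OWNER/kohya-ss-gui:v1.2.3
```

Because the build cache is pushed to a registry `:cache` ref rather than the `gha` cache backend, it is not subject to the GitHub Actions cache capacity limit.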
11 changes: 6 additions & 5 deletions Dockerfile
@@ -22,15 +22,13 @@ RUN --mount=type=cache,id=apt-$TARGETARCH$TARGETVARIANT,sharing=locked,target=/v
apt-get update && apt-get upgrade -y && \
apt-get install -y --no-install-recommends python3-launchpadlib git curl

# Install PyTorch and TensorFlow
# Install PyTorch
# The versions must align and be in sync with the requirements_linux_docker.txt
# hadolint ignore=SC2102
RUN --mount=type=cache,id=pip-$TARGETARCH$TARGETVARIANT,sharing=locked,target=/root/.cache/pip \
pip install -U --extra-index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://pypi.nvidia.com \
torch==2.1.2 torchvision==0.16.2 \
xformers==0.0.23.post1 \
# Why [and-cuda]: https://github.com/tensorflow/tensorflow/issues/61468#issuecomment-1759462485
tensorflow[and-cuda]==2.15.0.post1 \
ninja \
pip setuptools wheel

@@ -114,14 +112,17 @@ ENV PYTHONPATH="${PYTHONPATH}:/home/$UID/.local/lib/python3.10/site-packages"
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
ENV LD_PRELOAD=libtcmalloc.so
ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
# Rich logging
# https://rich.readthedocs.io/en/stable/console.html#interactive-mode
ENV FORCE_COLOR="true"
ENV COLUMNS="100"

WORKDIR /app

VOLUME [ "/dataset" ]

# 7860: Kohya GUI
# 6006: TensorBoard
EXPOSE 7860 6006
EXPOSE 7860

USER $UID

70 changes: 53 additions & 17 deletions README.md
@@ -22,6 +22,9 @@ The GUI allows you to set the training parameters and generate and run the requi
- [Manual installation](#manual-installation)
- [Pre-built Runpod template](#pre-built-runpod-template)
- [Docker](#docker)
- [Get your Docker ready for GPU support](#get-your-docker-ready-for-gpu-support)
- [Design of our Dockerfile](#design-of-our-dockerfile)
- [Use the pre-built Docker image](#use-the-pre-built-docker-image)
- [Local docker build](#local-docker-build)
- [ashleykleynhans runpod docker builds](#ashleykleynhans-runpod-docker-builds)
- [Upgrading](#upgrading)
@@ -229,34 +232,67 @@ To run from a pre-built Runpod template, you can:

### Docker

#### Local docker build
#### Get your Docker ready for GPU support

If you prefer to use Docker, follow the instructions below:
##### Windows

1. Ensure that you have Git and Docker installed on your Windows or Linux system.
Once you have installed [**Docker Desktop**](https://www.docker.com/products/docker-desktop/), [**CUDA Toolkit**](https://developer.nvidia.com/cuda-downloads), [**NVIDIA Windows Driver**](https://www.nvidia.com.tw/Download/index.aspx), and ensured that your Docker is running with [**WSL2**](https://docs.docker.com/desktop/wsl/#turn-on-docker-desktop-wsl-2), you are ready to go.

2. Open your OS shell (Command Prompt or Terminal) and run the following commands:
Here is the official documentation for further reference.
<https://docs.nvidia.com/cuda/wsl-user-guide/index.html#nvidia-compute-software-support-on-wsl-2>
<https://docs.docker.com/desktop/wsl/use-wsl/#gpu-support>

```bash
git clone --recursive https://github.com/bmaltais/kohya_ss.git
cd kohya_ss
docker compose up -d --build
```
##### Linux, macOS

Install an NVIDIA GPU Driver if you do not already have one installed.
<https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html>

Install the NVIDIA Container Toolkit by following this guide.
<https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html>

#### Design of our Dockerfile

- All training data must be stored in the `dataset` subdirectory, which is mounted into the container at `/dataset`.
- Please note that the file picker functionality is not available. Instead, you will need to manually input the folder path and configuration file path.
- TensorBoard has been decoupled from the project:
  - TensorBoard is not included in the Docker image.
  - The "Start TensorBoard" button has been hidden.
  - TensorBoard is launched from a separate container [as shown here](/docker-compose.yaml#L41).
- The browser won't be launched automatically. You will need to open it manually and navigate to [http://localhost:7860/](http://localhost:7860/) and [http://localhost:6006/](http://localhost:6006/).
- This Dockerfile has been designed to be easily disposable. You can discard the container at any time and restart it with a new code version.

#### Use the pre-built Docker image

Note: The initial run may take up to 20 minutes to complete.

```bash
git clone https://github.com/bmaltais/kohya_ss.git
cd kohya_ss
docker compose up -d
```

To update the system, run `docker compose down && docker compose up -d --pull always`.

#### Local docker build

> [!IMPORTANT]
> Clone the Git repository ***recursively*** to include submodules:
> `git clone --recursive https://github.com/bmaltais/kohya_ss.git`

```bash
git clone --recursive https://github.com/bmaltais/kohya_ss.git
cd kohya_ss
docker compose up -d --build
```

Please be aware of the following limitations when using Docker:
> [!NOTE]
> Building the image may take up to 20 minutes to complete.
- All training data must be placed in the `dataset` subdirectory, as the Docker container cannot access files from other directories.
- The file picker feature is not functional. You need to manually set the folder path and config file path.
- Dialogs may not work as expected, and it is recommended to use unique file names to avoid conflicts.
This Dockerfile has been designed to be easily disposable. You can discard the container at any time and rebuild it with a new version of the code. To update the system, run the update scripts outside of Docker and rebuild using `docker compose down && docker compose up -d --build`.
To update the system, ***check out the new code version*** and rebuild using `docker compose down && docker compose up -d --build --pull always`.

If you are running Linux, an alternative Docker container port with fewer limitations is available [here](https://github.com/P2Enjoy/kohya_ss-docker).
> If you are running on Linux, an alternative Docker container port with fewer limitations is available [here](https://github.com/P2Enjoy/kohya_ss-docker).

#### ashleykleynhans runpod docker builds

You may want to use the following Dockerfile repositories to build the images:
You may want to use the following repositories when running on Runpod:

- Standalone Kohya_ss template: <https://github.com/ashleykleynhans/kohya-docker>
- Auto1111 + Kohya_ss GUI template: <https://github.com/ashleykleynhans/stable-diffusion-docker>
27 changes: 22 additions & 5 deletions docker-compose.yaml
@@ -1,17 +1,18 @@
version: "3.8"

services:
kohya-ss-gui:
container_name: kohya-ss-gui
image: kohya-ss-gui:latest
image: ghcr.io/bmaltais/kohya-ss-gui:latest
user: 1000:0
build:
context: .
args:
- UID=1000
cache_from:
- ghcr.io/bmaltais/kohya-ss-gui:cache
cache_to:
- type=inline
ports:
- 7860:7860
- 6006:6006
environment:
SAFETENSORS_FAST_GPU: 1
tmpfs:
@@ -35,4 +36,20 @@ services:
devices:
- driver: nvidia
capabilities: [gpu]
device_ids: ['all']
device_ids: ["all"]

tensorboard:
container_name: tensorboard
image: tensorflow/tensorflow:latest-gpu
ports:
- 6006:6006
volumes:
- ./dataset/logs:/app/logs
command: tensorboard --logdir=/app/logs --bind_all
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
device_ids: ["all"]
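Since TensorBoard now runs as its own compose service, it can be started and stopped independently of the GUI container. A sketch, assuming you are in the repository root next to `docker-compose.yaml`:

```shell
mkdir -p dataset/logs                 # host dir bind-mounted to /app/logs
docker compose up -d tensorboard      # start only the TensorBoard service (port 6006)
# docker compose logs -f tensorboard  # follow its output
# docker compose stop tensorboard     # stop it without touching the GUI
```

Training logs written to `./dataset/logs` on the host then appear under `/app/logs` inside the TensorBoard container.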
3 changes: 1 addition & 2 deletions requirements_linux_docker.txt
@@ -1,5 +1,4 @@
xformers>=0.0.20
bitsandbytes==0.43.0
accelerate==0.25.0
tensorboard==2.15.2
tensorflow==2.15.0.post1
tensorboard
