diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index 470013862..0fc4f551c 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -85,36 +85,37 @@ jobs:
       fail-fast: false
       matrix:
         image:
-          - name: falcon-7b
-            node-count: 1
-            node-vm-size: Standard_NC12s_v3
-            node-osdisk-size: 100
-
-          - name: falcon-7b-instruct
-            node-count: 1
-            node-vm-size: Standard_NC12s_v3
-            node-osdisk-size: 100
-
-          # Uncomment once service/deployment made
-          # - name: falcon-40b
+          # NOTE(review): falcon-7b*, llama-2-7b and llama-2-13b are disabled by this change; confirm intent.
+          # - name: falcon-7b
           #   node-count: 1
-          #   node-vm-size: Standard_NC96ads_A100_v4
-          #   node-osdisk-size: 400
+          #   node-vm-size: Standard_NC12s_v3
+          #   node-osdisk-size: 100
 
-          # - name: falcon-40b-instruct
+          # - name: falcon-7b-instruct
           #   node-count: 1
-          #   node-vm-size: Standard_NC96ads_A100_v4
-          #   node-osdisk-size: 400
+          #   node-vm-size: Standard_NC12s_v3
+          #   node-osdisk-size: 100
 
-          - name: llama-2-7b
+          # falcon-40b presets: manifests are in presets/k8s/falcon-40b and presets/k8s/falcon-40b-instruct
+          - name: falcon-40b
             node-count: 1
-            node-vm-size: Standard_NC12s_v3
-            node-osdisk-size: 100
+            node-vm-size: Standard_NC96ads_A100_v4
+            node-osdisk-size: 400
+
+          - name: falcon-40b-instruct
+            node-count: 1
+            node-vm-size: Standard_NC96ads_A100_v4
+            node-osdisk-size: 400
+
+          # - name: llama-2-7b
+          #   node-count: 1
+          #   node-vm-size: Standard_NC12s_v3
+          #   node-osdisk-size: 100
 
-          - name: llama-2-13b
-            node-count: 2
-            node-vm-size: Standard_NC12s_v3
-            node-osdisk-size: 150
+          # - name: llama-2-13b
+          #   node-count: 2
+          #   node-vm-size: Standard_NC12s_v3
+          #   node-osdisk-size: 150
 
           # Uncomment once service/deployment made
           # - name: llama-2-70b
diff --git a/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-service.yaml
new file mode 100644
index 000000000..cc41fb6f7
--- /dev/null
+++ b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-service.yaml
@@ -0,0 +1,17 @@
+# LoadBalancer Service for falcon-40b-instruct: port 80 forwards to targetPort 5000.
+apiVersion: v1
+kind: Service
+metadata:
+  name: falcon-40b-instruct
+spec:
+  # Route only to the pod carrying the pod-name label falcon-40b-instruct-0.
+  selector:
+    app: falcon
+    statefulset.kubernetes.io/pod-name: falcon-40b-instruct-0
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 5000
+  type: LoadBalancer
+  # Publish endpoints regardless of the pod's readiness state.
+  publishNotReadyAddresses: true
diff --git a/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml
new file mode 100644
index 000000000..0c39ec31e
--- /dev/null
+++ b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml
@@ -0,0 +1,56 @@
+# Single-replica StatefulSet that starts inference-api.py for falcon-40b-instruct via `accelerate launch`.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: falcon-40b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+        - name: falcon-container
+          # REPO_HERE / TAG_HERE look like placeholders filled in before apply; TODO confirm against the workflow.
+          image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE
+          command:
+            - /bin/sh
+            - -c
+            - accelerate launch --config_file config.yaml --num_processes 1 --num_machines 1 --use_deepspeed --machine_rank 0 --gpu_ids all inference-api.py
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 600  # 10 Min
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          # Memory-backed emptyDir (declared under volumes) mounted at /dev/shm.
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      # Tolerate the NoSchedule taints sku=gpu and nvidia.com/gpu.
+      tolerations:
+        - effect: NoSchedule
+          key: sku
+          operator: Equal
+          value: gpu
+        - effect: NoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+      # Only schedule onto nodes labelled pool=n40binstruct.
+      nodeSelector:
+        pool: n40binstruct
diff --git a/presets/k8s/falcon-40b/falcon-40b-statefulset.yaml b/presets/k8s/falcon-40b/falcon-40b-statefulset.yaml
new file mode 100644
index 000000000..c213867df
--- /dev/null
+++ b/presets/k8s/falcon-40b/falcon-40b-statefulset.yaml
@@ -0,0 +1,56 @@
+# Single-replica StatefulSet that starts inference-api.py for falcon-40b via `accelerate launch`.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: falcon-40b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+        - name: falcon-container
+          # REPO_HERE / TAG_HERE look like placeholders filled in before apply; TODO confirm against the workflow.
+          image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE
+          command:
+            - /bin/sh
+            - -c
+            - accelerate launch --config_file config.yaml --num_processes 1 --num_machines 1 --use_deepspeed --machine_rank 0 --gpu_ids all inference-api.py
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 600  # 10 Min
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          # Memory-backed emptyDir (declared under volumes) mounted at /dev/shm.
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      # Tolerate the NoSchedule taints sku=gpu and nvidia.com/gpu.
+      tolerations:
+        - effect: NoSchedule
+          key: sku
+          operator: Equal
+          value: gpu
+        - effect: NoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+      # Only schedule onto nodes labelled pool=falcon40b.
+      nodeSelector:
+        pool: falcon40b
diff --git a/presets/k8s/falcon-40b/falcon-40b-service.yaml b/presets/k8s/falcon-40b/falcon-40b-service.yaml
new file mode 100644
index 000000000..599f70ca5
--- /dev/null
+++ b/presets/k8s/falcon-40b/falcon-40b-service.yaml
@@ -0,0 +1,17 @@
+# LoadBalancer Service for falcon-40b: port 80 forwards to targetPort 5000.
+apiVersion: v1
+kind: Service
+metadata:
+  name: falcon-40b
+spec:
+  # Route only to the pod carrying the pod-name label falcon-40b-0.
+  selector:
+    app: falcon
+    statefulset.kubernetes.io/pod-name: falcon-40b-0
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 5000
+  type: LoadBalancer
+  # Publish endpoints regardless of the pod's readiness state.
+  publishNotReadyAddresses: true