forked from kubeflow/training-operator
-
Notifications
You must be signed in to change notification settings - Fork 11
126 lines (113 loc) · 4.24 KB
/
integration-tests.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Workflow that runs the integration test suite for every pull request.
name: integration test

# Trigger: pull request events only.
on: [pull_request]

# Runs are grouped by workflow name and git ref; starting a new run in a group
# cancels the run of that group that is still in progress.
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: true
jobs:
  # Builds the training-operator image, deploys it into a Kind cluster and runs
  # the Python SDK tests against it, once per matrix entry.
  integration-test:
    runs-on: ubuntu-latest

    # The matrix is almost the same as the following cross product:
    #
    # ```yaml
    # strategy:
    #   fail-fast: false
    #   matrix:
    #     kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
    #     gang-scheduler-name: ["none", "scheduler-plugins", "volcano"]
    # ```
    #
    # The difference is that every combination is listed explicitly so that each
    # one can be paired with a Python version, in order to verify Python SDK
    # operations across several interpreters.
    strategy:
      # Let the remaining combinations finish even when one of them fails.
      fail-fast: false
      matrix:
        # Version values are quoted so they always stay strings
        # (an unquoted 3.10 would be read as the float 3.1).
        include:
          - kubernetes-version: "v1.29.2"
            gang-scheduler-name: "none"
            python-version: "3.10"
          # NOTE(review): Python 3.7 is end-of-life and may not be installable by
          # setup-python on newer ubuntu-latest images -- TODO confirm.
          - kubernetes-version: "v1.27.11"
            gang-scheduler-name: "none"
            python-version: "3.7"
          - kubernetes-version: "v1.28.7"
            gang-scheduler-name: "none"
            python-version: "3.8"
          - kubernetes-version: "v1.29.2"
            gang-scheduler-name: "scheduler-plugins"
            python-version: "3.9"
          - kubernetes-version: "v1.27.11"
            gang-scheduler-name: "scheduler-plugins"
            python-version: "3.10"
          - kubernetes-version: "v1.28.7"
            gang-scheduler-name: "scheduler-plugins"
            python-version: "3.10"
          - kubernetes-version: "v1.29.2"
            gang-scheduler-name: "volcano"
            python-version: "3.9"
          - kubernetes-version: "v1.27.11"
            gang-scheduler-name: "volcano"
            python-version: "3.10"
          - kubernetes-version: "v1.28.7"
            gang-scheduler-name: "volcano"
            python-version: "3.10"

    steps:
      # This step is a workaround to avoid the "No space left on device" error.
      # ref: https://github.com/actions/runner-images/issues/2840
      - name: Remove unnecessary files
        shell: bash
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/local/share/powershell
          sudo rm -rf /usr/share/swift
          echo "Disk usage after cleanup:"
          df -h

      - name: Checkout
        uses: actions/checkout@v3

      # Python version comes from the matrix entry.
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      # Go version is taken from the repository's go.mod file.
      - name: Setup Go
        uses: actions/setup-go@v3
        with:
          go-version-file: go.mod

      # The node image and kubectl both follow the matrix Kubernetes version.
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.3.0
        with:
          node_image: kindest/node:${{ matrix.kubernetes-version }}
          cluster_name: training-operator-cluster
          kubectl_version: ${{ matrix.kubernetes-version }}

      - name: Build training-operator
        run: |
          ./scripts/gha/build-image.sh
        env:
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

      # KIND_CLUSTER matches the cluster_name created above and
      # TRAINING_CI_IMAGE matches the image tag used by the build step.
      - name: Deploy training operator
        run: |
          ./scripts/gha/setup-training-operator.sh
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
          GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}
          KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}

      # Both installs go through `python3 -m pip` so they target the same
      # interpreter; one command per line keeps each failure attributable.
      - name: Run tests
        run: |
          python3 -m pip install pytest
          python3 -m pip install -e sdk/python
          pytest -s sdk/python/test --log-cli-level=debug --namespace=default
        env:
          GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}

      # Diagnostics only: runs when an earlier step failed on a volcano entry.
      # Every dump is best-effort (`|| true`) so that one failing kubectl call
      # does not prevent the remaining information from being collected.
      - name: Collect volcano logs
        if: ${{ failure() && matrix.gang-scheduler-name == 'volcano' }}
        run: |
          echo "dump volcano-scheduler logs..."
          kubectl logs -n volcano-system -l app=volcano-scheduler --tail=-1 || true
          echo "dump volcano-admission logs..."
          kubectl logs -n volcano-system -l app=volcano-admission --tail=-1 || true
          echo "dump volcano-controllers logs..."
          kubectl logs -n volcano-system -l app=volcano-controller --tail=-1 || true
          echo "dump podgroups description..."
          kubectl describe podgroups.scheduling.volcano.sh -A || true