# DeepSpeed CI - AWS L40S GPU Full Tests (PyTorch Latest)
#
# Runs the full DeepSpeed unit test suite on AWS self-hosted runners.
# Prefers 4x NVIDIA L40S GPUs on g6e.12xlarge instances, with AWS-side
# fallback to 8x A100 nodes when L40S capacity is unavailable.
#
# This workflow runs:
# - Parallel tests with pytest-xdist (-n 8)
# - Sequential tests marked with @pytest.mark.sequential
# - Nightly schedule: skips if no new commits since last successful run
###############################################################################
1313
name: aws-torch-latest-full

on:
  schedule:
    # Daily at 08:00 UTC (midnight PST).
    - cron: '0 8 * * *'
  workflow_dispatch:
    inputs:
      torch_preset:
        description: PyTorch preset to install for manual runs
        required: false
        # Must match a case label in the "Resolve PyTorch preset" step exactly
        # (no surrounding whitespace), otherwise that step exits with an error.
        default: '2.7.1-cu126'
        type: choice
        options:
          - '2.7.1-cu126'
          - '2.8.0-cu126'
          - '2.9.1-cu126'
          - '2.10.0-cu126'
          - '2.11.0-cu126'
2032
2133concurrency :
2234 group : ${{ github.workflow }}-${{ github.ref }}
2638 check-changes :
2739 name : Check for new commits
2840 runs-on : ubuntu-latest
29- # Only check on schedule; workflow_dispatch always runs
3041 if : github.event_name == 'schedule'
3142 outputs :
3243 has_changes : ${{ steps.check.outputs.has_changes }}
@@ -38,28 +49,26 @@ jobs:
3849 run : |
3950 default_branch="${{ github.event.repository.default_branch }}"
4051
41- # Get the HEAD SHA of the last successful run of this workflow
4252 last_sha=$(gh api \
43- "repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&branch=${default_branch}&per_page=1" \
53+ "repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&event=schedule& branch=${default_branch}&per_page=1" \
4454 --jq '.workflow_runs[0].head_sha // empty')
4555
4656 current_sha="${{ github.sha }}"
4757
4858 if [ -z "$last_sha" ]; then
49- echo "No previous successful run found — running tests"
59+ echo "No previous successful run found - running tests"
5060 echo "has_changes=true" >> "$GITHUB_OUTPUT"
5161 elif [ "$last_sha" = "$current_sha" ]; then
52- echo "No new commits since last successful run ($last_sha) — skipping"
62+ echo "No new commits since last successful run ($last_sha) - skipping"
5363 echo "has_changes=false" >> "$GITHUB_OUTPUT"
5464 else
55- echo "New commits detected: $last_sha -> $current_sha — running tests"
65+ echo "New commits detected: $last_sha -> $current_sha - running tests"
5666 echo "has_changes=true" >> "$GITHUB_OUTPUT"
5767 fi
5868
5969 unit-tests :
6070 name : Unit Tests (Full)
6171 needs : [check-changes]
62- # Run if: (a) workflow_dispatch, or (b) schedule with new commits
6372 if : |
6473 always() &&
6574 (github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
@@ -72,11 +81,10 @@ jobs:
7281 options : --gpus all --shm-size "32G" -v /mnt/aio:/mnt/aio
7382
7483 env :
75- TORCH_VER : " 2.7"
76- CUDA_VER : " 12.6"
84+ DEFAULT_TORCH_PRESET : ' 2.7.1-cu126'
7785 CUTLASS_PATH : /opt/cutlass
7886 # Disable reuse_dist_env to prevent pool worker cleanup hangs in full test runs
79- DS_DISABLE_REUSE_DIST_ENV : " 1 "
87+ DS_DISABLE_REUSE_DIST_ENV : ' 1 '
8088
8189 steps :
8290 - name : Install system dependencies
9098 with :
9199 lfs : true
92100
101+ - name : Resolve PyTorch preset
102+ env :
103+ GITHUB_EVENT_NAME : ${{ github.event_name }}
104+ MANUAL_TORCH_PRESET : ${{ github.event.inputs.torch_preset || '' }}
105+ run : |
106+ if [ "$GITHUB_EVENT_NAME" = 'workflow_dispatch' ] && [ -n "$MANUAL_TORCH_PRESET" ]; then
107+ selected_preset="$MANUAL_TORCH_PRESET"
108+ else
109+ selected_preset="$DEFAULT_TORCH_PRESET"
110+ fi
111+
112+ case "$selected_preset" in
113+ '2.7.1-cu126')
114+ torch_install_version='2.7.1'
115+ torchvision_install_version='0.22.1'
116+ torchaudio_install_version='2.7.1'
117+ torch_test_version='2.7'
118+ cuda_test_version='12.6'
119+ pytorch_index_url='https://download.pytorch.org/whl/cu126'
120+ ;;
121+ '2.8.0-cu126')
122+ torch_install_version='2.8.0'
123+ torchvision_install_version='0.23.0'
124+ torchaudio_install_version='2.8.0'
125+ torch_test_version='2.8'
126+ cuda_test_version='12.6'
127+ pytorch_index_url='https://download.pytorch.org/whl/cu126'
128+ ;;
129+ '2.9.1-cu126')
130+ torch_install_version='2.9.1'
131+ torchvision_install_version='0.24.1'
132+ torchaudio_install_version='2.9.1'
133+ torch_test_version='2.9'
134+ cuda_test_version='12.6'
135+ pytorch_index_url='https://download.pytorch.org/whl/cu126'
136+ ;;
137+ '2.10.0-cu126')
138+ torch_install_version='2.10.0'
139+ torchvision_install_version='0.25.0'
140+ torchaudio_install_version='2.10.0'
141+ torch_test_version='2.10'
142+ cuda_test_version='12.6'
143+ pytorch_index_url='https://download.pytorch.org/whl/cu126'
144+ ;;
145+ '2.11.0-cu126')
146+ torch_install_version='2.11.0'
147+ torchvision_install_version='0.26.0'
148+ torchaudio_install_version='2.11.0'
149+ torch_test_version='2.11'
150+ cuda_test_version='12.6'
151+ pytorch_index_url='https://download.pytorch.org/whl/cu126'
152+ ;;
153+ *)
154+ echo "Unsupported torch_preset: $selected_preset" >&2
155+ exit 1
156+ ;;
157+ esac
158+
159+ {
160+ echo "SELECTED_TORCH_PRESET=$selected_preset"
161+ echo "TORCH_INSTALL_VERSION=$torch_install_version"
162+ echo "TORCHVISION_INSTALL_VERSION=$torchvision_install_version"
163+ echo "TORCHAUDIO_INSTALL_VERSION=$torchaudio_install_version"
164+ echo "TORCH_TEST_VERSION=$torch_test_version"
165+ echo "CUDA_TEST_VERSION=$cuda_test_version"
166+ echo "PYTORCH_INDEX_URL=$pytorch_index_url"
167+ } >> "$GITHUB_ENV"
168+
169+ echo "Selected preset: $selected_preset"
170+ echo "Resolved install tuple: torch==$torch_install_version torchvision==$torchvision_install_version torchaudio==$torchaudio_install_version"
171+ echo "Resolved test expectations: torch=$torch_test_version cuda=$cuda_test_version"
172+ echo "Resolved PyTorch index: $pytorch_index_url"
173+
93174 - name : Install CUTLASS
94175 run : |
95176 git clone --depth 1 --branch v3.5.1 https://github.com/NVIDIA/cutlass.git /opt/cutlass
@@ -98,7 +179,11 @@ jobs:
98179
99180 - name : Install PyTorch
100181 run : |
101- pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126
182+ pip install \
183+ torch=="$TORCH_INSTALL_VERSION" \
184+ torchvision=="$TORCHVISION_INSTALL_VERSION" \
185+ torchaudio=="$TORCHAUDIO_INSTALL_VERSION" \
186+ --index-url "$PYTORCH_INDEX_URL"
102187
103188 - name : Install transformers
104189 run : |
@@ -117,6 +202,12 @@ jobs:
117202
118203 - name : Check environment
119204 run : |
205+ echo "=== Selected PyTorch Preset ==="
206+ echo "Preset: $SELECTED_TORCH_PRESET"
207+ echo "Install tuple: torch==$TORCH_INSTALL_VERSION torchvision==$TORCHVISION_INSTALL_VERSION torchaudio==$TORCHAUDIO_INSTALL_VERSION"
208+ echo "PyTorch index URL: $PYTORCH_INDEX_URL"
209+ echo "Expected test versions: torch=$TORCH_TEST_VERSION cuda=$CUDA_TEST_VERSION"
210+ echo ""
120211 echo "=== GPU Information ==="
121212 nvidia-smi
122213 echo ""
@@ -132,10 +223,32 @@ jobs:
132223 echo ""
133224 echo "=== CUTLASS ==="
134225 echo "CUTLASS_PATH: $CUTLASS_PATH"
          ls -la "$CUTLASS_PATH"/include/ | head -5
227+
228+ - name : Detect GPU architecture
229+ run : |
230+ python - <<'PY'
231+ import os
232+ import torch
233+
234+ torch.cuda.init()
235+ major, minor = torch.cuda.get_device_capability(0)
236+ arch = f"{major}.{minor}"
237+ gpu_count = torch.cuda.device_count()
238+ gpu_name = torch.cuda.get_device_name(0)
239+
240+ with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
241+ env_file.write(f"TORCH_CUDA_ARCH_LIST={arch}\n")
242+ env_file.write(f"GPU_COUNT={gpu_count}\n")
243+
244+ print(f"Detected GPU: {gpu_name}")
245+ print(f"Detected compute capability: {arch}")
246+ print(f"Detected GPU count: {gpu_count}")
247+ PY
136248
137249 - name : Install DeepSpeed
138250 run : |
251+ echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
139252 # Initialize CUDA before install so setup.py can detect NCCL version
140253 python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
141254 # Use --no-build-isolation so setup.py can access pre-installed PyTorch
@@ -148,7 +261,7 @@ jobs:
148261
149262 - name : Unit tests (parallel)
150263 run : |
151- export TORCH_CUDA_ARCH_LIST="8.9 "
264+ echo "Running parallel tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs "
152265 cd tests
153266 # Skip tests requiring unavailable hardware or known issues:
154267 # - nvme checkpointing: no nvme device
@@ -162,11 +275,11 @@ jobs:
162275 --ignore=unit/launcher/test_user_args.py \
163276 --ignore=unit/runtime/zenflow \
164277 --ignore=unit/ops/adam/test_zf_torch_adam.py \
165- --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
278+ --torch_ver="$TORCH_TEST_VERSION" --cuda_ver="$CUDA_TEST_VERSION"
166279
167280 - name : Unit tests (sequential)
168281 run : |
169- export TORCH_CUDA_ARCH_LIST="8.9 "
282+ echo "Running sequential tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs "
170283 cd tests
171284 rm -rf /mnt/aio/pytest
172285 pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \
@@ -176,4 +289,4 @@ jobs:
176289 --ignore=unit/runtime/zenflow \
177290 --ignore=unit/ops/adam/test_zf_torch_adam.py \
178291 --ignore=unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py \
179- --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
292+ --torch_ver="$TORCH_TEST_VERSION" --cuda_ver="$CUDA_TEST_VERSION"
0 commit comments