Skip to content

Commit a004b49

Browse files
Merge branch 'master' into fix/bf16-zero3-quantized-weights
2 parents 3d0c53c + 0ba2352 commit a004b49

117 files changed

Lines changed: 5637 additions & 854 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/aws-torch-latest-full.yml

Lines changed: 132 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
# DeepSpeed CI - AWS L40S GPU Full Tests (PyTorch Latest)
33
#
44
# Runs the full DeepSpeed unit test suite on AWS self-hosted runners.
5-
# Uses 4x NVIDIA L40S GPUs on g6e.12xlarge instances.
5+
# Prefers 4x NVIDIA L40S GPUs on g6e.12xlarge instances, with AWS-side
6+
# fallback to 8x A100 nodes when L40S capacity is unavailable.
67
#
78
# This workflow runs:
89
# - Parallel tests with pytest-xdist (-n 8)
910
# - Sequential tests marked with @pytest.mark.sequential
10-
#
11-
# Nightly schedule: skips if no new commits since last successful run.
11+
# - Nightly schedule: the run is skipped when there are no new commits since the last successful scheduled run
1212
################################################################################
1313

1414
name: aws-torch-latest-full
@@ -17,6 +17,18 @@ on:
1717
schedule:
1818
- cron: '0 8 * * *' # Daily at 08:00 UTC (midnight PST / 01:00 PDT)
1919
workflow_dispatch:
20+
inputs:
21+
torch_preset:
22+
description: PyTorch preset to install for manual runs
23+
required: false
24+
default: '2.7.1-cu126'
25+
type: choice
26+
options:
27+
- '2.7.1-cu126'
28+
- '2.8.0-cu126'
29+
- '2.9.1-cu126'
30+
- '2.10.0-cu126'
31+
- '2.11.0-cu126'
2032

2133
concurrency:
2234
group: ${{ github.workflow }}-${{ github.ref }}
@@ -26,7 +38,6 @@ jobs:
2638
check-changes:
2739
name: Check for new commits
2840
runs-on: ubuntu-latest
29-
# Only check on schedule; workflow_dispatch always runs
3041
if: github.event_name == 'schedule'
3142
outputs:
3243
has_changes: ${{ steps.check.outputs.has_changes }}
@@ -38,28 +49,26 @@ jobs:
3849
run: |
3950
default_branch="${{ github.event.repository.default_branch }}"
4051
41-
# Get the HEAD SHA of the last successful run of this workflow
4252
last_sha=$(gh api \
43-
"repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&branch=${default_branch}&per_page=1" \
53+
"repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&event=schedule&branch=${default_branch}&per_page=1" \
4454
--jq '.workflow_runs[0].head_sha // empty')
4555
4656
current_sha="${{ github.sha }}"
4757
4858
if [ -z "$last_sha" ]; then
49-
echo "No previous successful run found running tests"
59+
echo "No previous successful run found - running tests"
5060
echo "has_changes=true" >> "$GITHUB_OUTPUT"
5161
elif [ "$last_sha" = "$current_sha" ]; then
52-
echo "No new commits since last successful run ($last_sha) skipping"
62+
echo "No new commits since last successful run ($last_sha) - skipping"
5363
echo "has_changes=false" >> "$GITHUB_OUTPUT"
5464
else
55-
echo "New commits detected: $last_sha -> $current_sha running tests"
65+
echo "New commits detected: $last_sha -> $current_sha - running tests"
5666
echo "has_changes=true" >> "$GITHUB_OUTPUT"
5767
fi
5868
5969
unit-tests:
6070
name: Unit Tests (Full)
6171
needs: [check-changes]
62-
# Run if: (a) workflow_dispatch, or (b) schedule with new commits
6372
if: |
6473
always() &&
6574
(github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
@@ -72,11 +81,10 @@ jobs:
7281
options: --gpus all --shm-size "32G" -v /mnt/aio:/mnt/aio
7382

7483
env:
75-
TORCH_VER: "2.7"
76-
CUDA_VER: "12.6"
84+
DEFAULT_TORCH_PRESET: '2.7.1-cu126'
7785
CUTLASS_PATH: /opt/cutlass
7886
# Disable reuse_dist_env to prevent pool worker cleanup hangs in full test runs
79-
DS_DISABLE_REUSE_DIST_ENV: "1"
87+
DS_DISABLE_REUSE_DIST_ENV: '1'
8088

8189
steps:
8290
- name: Install system dependencies
@@ -90,6 +98,79 @@ jobs:
9098
with:
9199
lfs: true
92100

101+
- name: Resolve PyTorch preset
102+
env:
103+
GITHUB_EVENT_NAME: ${{ github.event_name }}
104+
MANUAL_TORCH_PRESET: ${{ github.event.inputs.torch_preset || '' }}
105+
run: |
106+
if [ "$GITHUB_EVENT_NAME" = 'workflow_dispatch' ] && [ -n "$MANUAL_TORCH_PRESET" ]; then
107+
selected_preset="$MANUAL_TORCH_PRESET"
108+
else
109+
selected_preset="$DEFAULT_TORCH_PRESET"
110+
fi
111+
112+
case "$selected_preset" in
113+
'2.7.1-cu126')
114+
torch_install_version='2.7.1'
115+
torchvision_install_version='0.22.1'
116+
torchaudio_install_version='2.7.1'
117+
torch_test_version='2.7'
118+
cuda_test_version='12.6'
119+
pytorch_index_url='https://download.pytorch.org/whl/cu126'
120+
;;
121+
'2.8.0-cu126')
122+
torch_install_version='2.8.0'
123+
torchvision_install_version='0.23.0'
124+
torchaudio_install_version='2.8.0'
125+
torch_test_version='2.8'
126+
cuda_test_version='12.6'
127+
pytorch_index_url='https://download.pytorch.org/whl/cu126'
128+
;;
129+
'2.9.1-cu126')
130+
torch_install_version='2.9.1'
131+
torchvision_install_version='0.24.1'
132+
torchaudio_install_version='2.9.1'
133+
torch_test_version='2.9'
134+
cuda_test_version='12.6'
135+
pytorch_index_url='https://download.pytorch.org/whl/cu126'
136+
;;
137+
'2.10.0-cu126')
138+
torch_install_version='2.10.0'
139+
torchvision_install_version='0.25.0'
140+
torchaudio_install_version='2.10.0'
141+
torch_test_version='2.10'
142+
cuda_test_version='12.6'
143+
pytorch_index_url='https://download.pytorch.org/whl/cu126'
144+
;;
145+
'2.11.0-cu126')
146+
torch_install_version='2.11.0'
147+
torchvision_install_version='0.26.0'
148+
torchaudio_install_version='2.11.0'
149+
torch_test_version='2.11'
150+
cuda_test_version='12.6'
151+
pytorch_index_url='https://download.pytorch.org/whl/cu126'
152+
;;
153+
*)
154+
echo "Unsupported torch_preset: $selected_preset" >&2
155+
exit 1
156+
;;
157+
esac
158+
159+
{
160+
echo "SELECTED_TORCH_PRESET=$selected_preset"
161+
echo "TORCH_INSTALL_VERSION=$torch_install_version"
162+
echo "TORCHVISION_INSTALL_VERSION=$torchvision_install_version"
163+
echo "TORCHAUDIO_INSTALL_VERSION=$torchaudio_install_version"
164+
echo "TORCH_TEST_VERSION=$torch_test_version"
165+
echo "CUDA_TEST_VERSION=$cuda_test_version"
166+
echo "PYTORCH_INDEX_URL=$pytorch_index_url"
167+
} >> "$GITHUB_ENV"
168+
169+
echo "Selected preset: $selected_preset"
170+
echo "Resolved install tuple: torch==$torch_install_version torchvision==$torchvision_install_version torchaudio==$torchaudio_install_version"
171+
echo "Resolved test expectations: torch=$torch_test_version cuda=$cuda_test_version"
172+
echo "Resolved PyTorch index: $pytorch_index_url"
173+
93174
- name: Install CUTLASS
94175
run: |
95176
git clone --depth 1 --branch v3.5.1 https://github.com/NVIDIA/cutlass.git /opt/cutlass
@@ -98,7 +179,11 @@ jobs:
98179
99180
- name: Install PyTorch
100181
run: |
101-
pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126
182+
pip install \
183+
torch=="$TORCH_INSTALL_VERSION" \
184+
torchvision=="$TORCHVISION_INSTALL_VERSION" \
185+
torchaudio=="$TORCHAUDIO_INSTALL_VERSION" \
186+
--index-url "$PYTORCH_INDEX_URL"
102187
103188
- name: Install transformers
104189
run: |
@@ -117,6 +202,12 @@ jobs:
117202
118203
- name: Check environment
119204
run: |
205+
echo "=== Selected PyTorch Preset ==="
206+
echo "Preset: $SELECTED_TORCH_PRESET"
207+
echo "Install tuple: torch==$TORCH_INSTALL_VERSION torchvision==$TORCHVISION_INSTALL_VERSION torchaudio==$TORCHAUDIO_INSTALL_VERSION"
208+
echo "PyTorch index URL: $PYTORCH_INDEX_URL"
209+
echo "Expected test versions: torch=$TORCH_TEST_VERSION cuda=$CUDA_TEST_VERSION"
210+
echo ""
120211
echo "=== GPU Information ==="
121212
nvidia-smi
122213
echo ""
@@ -132,10 +223,32 @@ jobs:
132223
echo ""
133224
echo "=== CUTLASS ==="
134225
echo "CUTLASS_PATH: $CUTLASS_PATH"
135-
ls -la $CUTLASS_PATH/include/ | head -5
226+
ls -la "$CUTLASS_PATH"/include/ | head -5
227+
228+
- name: Detect GPU architecture
229+
run: |
230+
python - <<'PY'
231+
import os
232+
import torch
233+
234+
torch.cuda.init()
235+
major, minor = torch.cuda.get_device_capability(0)
236+
arch = f"{major}.{minor}"
237+
gpu_count = torch.cuda.device_count()
238+
gpu_name = torch.cuda.get_device_name(0)
239+
240+
with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
241+
env_file.write(f"TORCH_CUDA_ARCH_LIST={arch}\n")
242+
env_file.write(f"GPU_COUNT={gpu_count}\n")
243+
244+
print(f"Detected GPU: {gpu_name}")
245+
print(f"Detected compute capability: {arch}")
246+
print(f"Detected GPU count: {gpu_count}")
247+
PY
136248
137249
- name: Install DeepSpeed
138250
run: |
251+
echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
139252
# Initialize CUDA before install so setup.py can detect NCCL version
140253
python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
141254
# Use --no-build-isolation so setup.py can access pre-installed PyTorch
@@ -148,7 +261,7 @@ jobs:
148261
149262
- name: Unit tests (parallel)
150263
run: |
151-
export TORCH_CUDA_ARCH_LIST="8.9"
264+
echo "Running parallel tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
152265
cd tests
153266
# Skip tests requiring unavailable hardware or known issues:
154267
# - nvme checkpointing: no nvme device
@@ -162,11 +275,11 @@ jobs:
162275
--ignore=unit/launcher/test_user_args.py \
163276
--ignore=unit/runtime/zenflow \
164277
--ignore=unit/ops/adam/test_zf_torch_adam.py \
165-
--torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
278+
--torch_ver="$TORCH_TEST_VERSION" --cuda_ver="$CUDA_TEST_VERSION"
166279
167280
- name: Unit tests (sequential)
168281
run: |
169-
export TORCH_CUDA_ARCH_LIST="8.9"
282+
echo "Running sequential tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
170283
cd tests
171284
rm -rf /mnt/aio/pytest
172285
pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \
@@ -176,4 +289,4 @@ jobs:
176289
--ignore=unit/runtime/zenflow \
177290
--ignore=unit/ops/adam/test_zf_torch_adam.py \
178291
--ignore=unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py \
179-
--torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
292+
--torch_ver="$TORCH_TEST_VERSION" --cuda_ver="$CUDA_TEST_VERSION"

.github/workflows/nv-pre-compile-ops.yml

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,20 @@ jobs:
2323
unit-tests:
2424
runs-on: ubuntu-24.04
2525
container:
26-
image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
26+
image: nvidia/cuda:12.6.3-devel-ubuntu22.04
2727

2828
steps:
29+
- name: Install system dependencies
30+
run: |
31+
apt-get update && apt-get install -y git python3 python3-pip libaio-dev ninja-build
32+
ln -sf /usr/bin/python3 /usr/bin/python
33+
2934
- uses: actions/checkout@v4
3035

36+
- name: Install PyTorch
37+
run: |
38+
pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu126
39+
3140
- name: environment
3241
run: |
3342
which python
@@ -36,7 +45,7 @@ jobs:
3645
#python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
3746
- name: Compile DeepSpeed Ops
3847
run: |
39-
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
48+
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
4049
- name: DS Report
4150
run: |
42-
ds_report
51+
DS_ACCELERATOR=cuda ds_report

.github/workflows/xpu-compile.yml

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
compile-tests:
2121
runs-on: [self-hosted, intel, xpu]
2222
container:
23-
image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
23+
image: intel/oneapi-basekit:2025.0.2-0-devel-ubuntu22.04
2424
ports:
2525
- 80
2626
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
@@ -31,11 +31,7 @@ jobs:
3131
run: |
3232
apt-get update
3333
apt-get install clinfo libaio-dev python3-pip -y
34-
pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
35-
pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
36-
pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
37-
pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
38-
pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl
34+
pip install torch==2.10.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
3935
pip install py-cpuinfo numpy
4036
pip install .[dev,autotuning]
4137
@@ -44,7 +40,7 @@ jobs:
4440
ldd --version
4541
ds_report
4642
python3 -c "import torch; print('torch:', torch.__version__, torch)"
47-
python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
43+
python3 -c "import torch; print('XPU available:', torch.xpu.is_available())"
4844
python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
4945
pip list
5046

.github/workflows/xpu-max1100.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,7 @@ jobs:
5050
apt-get install -y python3.11 python3.11-dev python3-pip clinfo libaio-dev
5151
pip install --upgrade pip
5252
pip install py-cpuinfo
53-
pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu
54-
pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us --trusted-host pytorch-extension.intel.com
53+
pip install torch==2.10.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
5554
pip install .[dev,autotuning]
5655
5756
- name: Check container state
@@ -60,7 +59,7 @@ jobs:
6059
ldd --version
6160
ds_report
6261
python3 -c "import torch; print('torch:', torch.__version__, torch)"
63-
python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
62+
python3 -c "import torch; print('XPU available:', torch.xpu.is_available())"
6463
python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
6564
pip list
6665

AGENTS.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
<!-- This file is duplicated as CLAUDE.md and AGENTS.md. Keep them in sync. -->
2+
# AGENTS.md — Workspace-level instructions for AI coding agents
3+
4+
## DeepSpeed Project Rules
5+
6+
### Commit & CI requirements
7+
8+
- All commits MUST have a `Signed-off-by` line (use `--signoff`). Get the name and email from `git config user.name` / `git config user.email`.
9+
- Formatting: yapf (column_limit=119, `.style.yapf`) + flake8 (`.flake8`).
10+
- Always verify changed files pass pre-commit checks before committing: `pre-commit run --files <changed_files>`. Only check modified files, not the entire codebase. Config: `.pre-commit-config.yaml`.
11+
- `check-torchdist` hook: NEVER directly import torch's distributed module. Use `import deepspeed.comm as dist` instead.
12+
- New files require license header:
13+
```
14+
# SPDX-License-Identifier: Apache-2.0
15+
# DeepSpeed Team
16+
```
17+
18+
### Code change discipline
19+
20+
- NEVER make cosmetic/formatting-only changes to existing code. Only add/modify lines that are functionally necessary. Minimizing diff noise is critical for code review.
21+
- Delete dead code decisively — if code is unused at runtime (only referenced in tests), remove it along with its tests.
22+
- Prefer consolidating tests over proliferating test files.
23+
- Blend in: when modifying code, read the surrounding context and match the style of neighboring code (naming, spacing, patterns, idioms).
24+
- Write beginner-friendly code: avoid deeply nested expressions or chained logic. Break complex expressions into clear, named intermediate steps.
25+
- Comments should explain **why**, not **what**. Describe the purpose and reasoning, not the mechanics that the code already shows.
26+
- New features must include corresponding tests and documentation updates.
27+
28+
## Tool Caveats
29+
30+
### Edit tool auto-formatter
31+
32+
The Edit tool has a hidden auto-formatter that silently changes quotes, whitespace, blank lines, and line wrapping. For format-sensitive modifications (e.g., when exact formatting matters for pre-commit), use `bash` with `sed`, `python`, or `cat` instead.

0 commit comments

Comments (0)