aws-torch-latest-full #54
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################
# DeepSpeed CI - AWS L40S GPU Full Tests (PyTorch Latest)
#
# Runs the full DeepSpeed unit test suite on AWS self-hosted runners.
# Prefers 4x NVIDIA L40S GPUs on g6e.12xlarge instances, with AWS-side
# fallback to 8x A100 nodes when L40S capacity is unavailable.
#
# This workflow runs:
#   - Parallel tests with pytest-xdist (-n 8)
#   - Sequential tests marked with @pytest.mark.sequential
#   - Nightly schedule: skips if no new commits since last successful run
################################################################################
name: aws-torch-latest-full

on:
  schedule:
    - cron: '0 8 * * *'  # Daily at 08:00 UTC (midnight PST)
  workflow_dispatch:

# Only one run per ref at a time; a newer push cancels the in-flight run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Gate job for the nightly schedule: compares HEAD against the head SHA of
  # the last successful run of this workflow and skips the (expensive) GPU job
  # when nothing new has landed. Manual dispatch bypasses this job entirely.
  check-changes:
    name: Check for new commits
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule'
    outputs:
      has_changes: ${{ steps.check.outputs.has_changes }}
    steps:
      - name: Check for commits since last successful run
        id: check
        env:
          GH_TOKEN: ${{ github.token }}
          # Pass expression values via env rather than interpolating ${{ }}
          # directly into the script (script-injection hardening).
          DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
          REPO: ${{ github.repository }}
          CURRENT_SHA: ${{ github.sha }}
        run: |
          last_sha=$(gh api \
            "repos/${REPO}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&branch=${DEFAULT_BRANCH}&per_page=1" \
            --jq '.workflow_runs[0].head_sha // empty')
          if [ -z "$last_sha" ]; then
            echo "No previous successful run found - running tests"
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          elif [ "$last_sha" = "$CURRENT_SHA" ]; then
            echo "No new commits since last successful run ($last_sha) - skipping"
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          else
            echo "New commits detected: $last_sha -> $CURRENT_SHA - running tests"
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          fi

  unit-tests:
    name: Unit Tests (Full)
    needs: [check-changes]
    # check-changes is skipped on workflow_dispatch, which would normally skip
    # dependents too; !cancelled() lets the job run past a skipped upstream job
    # without also running when the workflow itself has been cancelled
    # (always() would start the GPU job even on cancellation).
    if: |
      !cancelled() &&
      (github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
    runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-4gpu, aws]
    timeout-minutes: 180
    container:
      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
      # Mount /mnt/aio for async I/O tests (O_DIRECT requires native filesystem, not overlayfs)
      options: --gpus all --shm-size "32G" -v /mnt/aio:/mnt/aio
    env:
      TORCH_VER: "2.7"
      CUDA_VER: "12.6"
      CUTLASS_PATH: /opt/cutlass
      # Disable reuse_dist_env to prevent pool worker cleanup hangs in full test runs
      DS_DISABLE_REUSE_DIST_ENV: "1"
    steps:
      - name: Install system dependencies
        run: |
          apt-get update && apt-get install -y git git-lfs libaio-dev pdsh python3 python3-pip
          git lfs install
          ln -sf /usr/bin/python3 /usr/bin/python

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install CUTLASS
        run: |
          git clone --depth 1 --branch v3.5.1 https://github.com/NVIDIA/cutlass.git /opt/cutlass
          echo "CUTLASS installed at /opt/cutlass"
          ls -la /opt/cutlass/include/ | head -10

      - name: Install PyTorch
        run: |
          pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git checkout 981c276
          pip install .

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements/requirements.txt
          pip install -r requirements/requirements-dev.txt
          pip install -r requirements/requirements-deepcompile.txt
          pip install pytest-timeout pytest-instafail

      - name: Check environment
        run: |
          echo "=== GPU Information ==="
          nvidia-smi
          echo ""
          echo "=== CUDA Version ==="
          nvcc --version
          echo ""
          echo "=== Python/PyTorch Info ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
          python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"
          echo ""
          echo "=== CUTLASS ==="
          echo "CUTLASS_PATH: $CUTLASS_PATH"
          ls -la $CUTLASS_PATH/include/ | head -5

      # Export TORCH_CUDA_ARCH_LIST / GPU_COUNT for the build and test steps so
      # the extension build targets only the arch actually present on the node
      # (L40S or the A100 fallback).
      - name: Detect GPU architecture
        run: |
          python - <<'PY'
          import os
          import torch
          torch.cuda.init()
          major, minor = torch.cuda.get_device_capability(0)
          arch = f"{major}.{minor}"
          gpu_count = torch.cuda.device_count()
          gpu_name = torch.cuda.get_device_name(0)
          with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
              env_file.write(f"TORCH_CUDA_ARCH_LIST={arch}\n")
              env_file.write(f"GPU_COUNT={gpu_count}\n")
          print(f"Detected GPU: {gpu_name}")
          print(f"Detected compute capability: {arch}")
          print(f"Detected GPU count: {gpu_count}")
          PY

      - name: Install DeepSpeed
        run: |
          echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
          # Initialize CUDA before install so setup.py can detect NCCL version
          python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
          # Use --no-build-isolation so setup.py can access pre-installed PyTorch
          pip install --no-build-isolation .[dev,1bit,autotuning,deepcompile]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests (parallel)
        run: |
          echo "Running parallel tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
          cd tests
          # Skip tests requiring unavailable hardware or known issues:
          #   - nvme checkpointing: no nvme device
          #   - GDS tests: no GPUDirect Storage support
          #   - launcher user_args: pdsh requires SSH server
          #   - zenflow: Stage 3 tests have pre-existing bugs + CUDA/fork issues
          rm -rf /mnt/aio/pytest
          pytest --instafail --timeout 600 --forked -n 8 --basetemp=/mnt/aio/pytest unit/ \
            --ignore=unit/runtime/zero/test_nvme_checkpointing.py \
            --ignore=unit/ops/aio/test_gds.py \
            --ignore=unit/launcher/test_user_args.py \
            --ignore=unit/runtime/zenflow \
            --ignore=unit/ops/adam/test_zf_torch_adam.py \
            --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}

      - name: Unit tests (sequential)
        run: |
          echo "Running sequential tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
          cd tests
          rm -rf /mnt/aio/pytest
          pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \
            --ignore=unit/runtime/zero/test_nvme_checkpointing.py \
            --ignore=unit/ops/aio/test_gds.py \
            --ignore=unit/launcher/test_user_args.py \
            --ignore=unit/runtime/zenflow \
            --ignore=unit/ops/adam/test_zf_torch_adam.py \
            --ignore=unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py \
            --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}