aws-torch-latest-full #54
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################
# DeepSpeed CI - AWS L40S GPU Full Tests (PyTorch Latest)
#
# Runs the full DeepSpeed unit test suite on AWS self-hosted runners.
# Prefers 4x NVIDIA L40S GPUs on g6e.12xlarge instances, with AWS-side
# fallback to 8x A100 nodes when L40S capacity is unavailable.
#
# This workflow runs:
#   - Parallel tests with pytest-xdist (-n 8)
#   - Sequential tests marked with @pytest.mark.sequential
#   - Nightly schedule: skips if no new commits since last successful run
################################################################################
name: aws-torch-latest-full

on:
  schedule:
    - cron: '0 8 * * *'  # Daily at 08:00 UTC (midnight PST)
  workflow_dispatch:

# Only one run per ref at a time; a newer push cancels the in-flight run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Gate job for the nightly schedule: compares HEAD against the head SHA of
  # the last successful run of this workflow and skips the (expensive) GPU job
  # when nothing new has landed. Manual dispatch bypasses this job entirely.
  check-changes:
    name: Check for new commits
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule'
    outputs:
      has_changes: ${{ steps.check.outputs.has_changes }}
    steps:
      - name: Check for commits since last successful run
        id: check
        env:
          GH_TOKEN: ${{ github.token }}
          # Pass expression values via env rather than interpolating ${{ }}
          # directly into the script (script-injection hardening).
          DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
          REPO: ${{ github.repository }}
          CURRENT_SHA: ${{ github.sha }}
        run: |
          last_sha=$(gh api \
            "repos/${REPO}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&branch=${DEFAULT_BRANCH}&per_page=1" \
            --jq '.workflow_runs[0].head_sha // empty')
          if [ -z "$last_sha" ]; then
            echo "No previous successful run found - running tests"
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          elif [ "$last_sha" = "$CURRENT_SHA" ]; then
            echo "No new commits since last successful run ($last_sha) - skipping"
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          else
            echo "New commits detected: $last_sha -> $CURRENT_SHA - running tests"
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          fi

  unit-tests:
    name: Unit Tests (Full)
    needs: [check-changes]
    # check-changes is skipped on workflow_dispatch, which would normally skip
    # dependents too; !cancelled() lets the job run past a skipped upstream job
    # without also running when the workflow itself has been cancelled
    # (always() would start the GPU job even on cancellation).
    if: |
      !cancelled() &&
      (github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
    runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-4gpu, aws]
    timeout-minutes: 180
    container:
      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
      # Mount /mnt/aio for async I/O tests (O_DIRECT requires native filesystem, not overlayfs)
      options: --gpus all --shm-size "32G" -v /mnt/aio:/mnt/aio
    env:
      TORCH_VER: "2.7"
      CUDA_VER: "12.6"
      CUTLASS_PATH: /opt/cutlass
      # Disable reuse_dist_env to prevent pool worker cleanup hangs in full test runs
      DS_DISABLE_REUSE_DIST_ENV: "1"
    steps:
      - name: Install system dependencies
        run: |
          apt-get update && apt-get install -y git git-lfs libaio-dev pdsh python3 python3-pip
          git lfs install
          ln -sf /usr/bin/python3 /usr/bin/python

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install CUTLASS
        run: |
          git clone --depth 1 --branch v3.5.1 https://github.com/NVIDIA/cutlass.git /opt/cutlass
          echo "CUTLASS installed at /opt/cutlass"
          ls -la /opt/cutlass/include/ | head -10

      - name: Install PyTorch
        run: |
          pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git checkout 981c276
          pip install .

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements/requirements.txt
          pip install -r requirements/requirements-dev.txt
          pip install -r requirements/requirements-deepcompile.txt
          pip install pytest-timeout pytest-instafail

      - name: Check environment
        run: |
          echo "=== GPU Information ==="
          nvidia-smi
          echo ""
          echo "=== CUDA Version ==="
          nvcc --version
          echo ""
          echo "=== Python/PyTorch Info ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
          python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"
          echo ""
          echo "=== CUTLASS ==="
          echo "CUTLASS_PATH: $CUTLASS_PATH"
          ls -la $CUTLASS_PATH/include/ | head -5

      # Export TORCH_CUDA_ARCH_LIST / GPU_COUNT for the build and test steps so
      # the extension build targets only the arch actually present on the node
      # (L40S or the A100 fallback).
      - name: Detect GPU architecture
        run: |
          python - <<'PY'
          import os
          import torch
          torch.cuda.init()
          major, minor = torch.cuda.get_device_capability(0)
          arch = f"{major}.{minor}"
          gpu_count = torch.cuda.device_count()
          gpu_name = torch.cuda.get_device_name(0)
          with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
              env_file.write(f"TORCH_CUDA_ARCH_LIST={arch}\n")
              env_file.write(f"GPU_COUNT={gpu_count}\n")
          print(f"Detected GPU: {gpu_name}")
          print(f"Detected compute capability: {arch}")
          print(f"Detected GPU count: {gpu_count}")
          PY

      - name: Install DeepSpeed
        run: |
          echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
          # Initialize CUDA before install so setup.py can detect NCCL version
          python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
          # Use --no-build-isolation so setup.py can access pre-installed PyTorch
          pip install --no-build-isolation .[dev,1bit,autotuning,deepcompile]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests (parallel)
        run: |
          echo "Running parallel tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
          cd tests
          # Skip tests requiring unavailable hardware or known issues:
          #   - nvme checkpointing: no nvme device
          #   - GDS tests: no GPUDirect Storage support
          #   - launcher user_args: pdsh requires SSH server
          #   - zenflow: Stage 3 tests have pre-existing bugs + CUDA/fork issues
          rm -rf /mnt/aio/pytest
          pytest --instafail --timeout 600 --forked -n 8 --basetemp=/mnt/aio/pytest unit/ \
            --ignore=unit/runtime/zero/test_nvme_checkpointing.py \
            --ignore=unit/ops/aio/test_gds.py \
            --ignore=unit/launcher/test_user_args.py \
            --ignore=unit/runtime/zenflow \
            --ignore=unit/ops/adam/test_zf_torch_adam.py \
            --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}

      - name: Unit tests (sequential)
        run: |
          echo "Running sequential tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
          cd tests
          rm -rf /mnt/aio/pytest
          pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \
            --ignore=unit/runtime/zero/test_nvme_checkpointing.py \
            --ignore=unit/ops/aio/test_gds.py \
            --ignore=unit/launcher/test_user_args.py \
            --ignore=unit/runtime/zenflow \
            --ignore=unit/ops/adam/test_zf_torch_adam.py \
            --ignore=unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py \
            --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}