Skip to content

Commit d786b77

Browse files
authored
Add dynamic neuron driver downgrade for al2023neu AMI on inf1 instances (#596)
1 parent c9283ad commit d786b77

4 files changed

Lines changed: 174 additions & 2 deletions

File tree

al2023.pkr.hcl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,18 @@ build {
165165
]
166166
}
167167

168+
provisioner "file" {
169+
source = "scripts/al2023/neuron/neuron-inf1-downgrade.sh"
170+
destination = "/tmp/neuron-inf1-downgrade.sh"
171+
only = ["amazon-ebs.al2023neu"]
172+
}
173+
174+
provisioner "file" {
175+
source = "scripts/al2023/neuron/neuron-inf1-downgrade.service"
176+
destination = "/tmp/neuron-inf1-downgrade.service"
177+
only = ["amazon-ebs.al2023neu"]
178+
}
179+
168180
provisioner "shell" {
169181
environment_vars = [
170182
"AMI_TYPE=${source.name}"
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[Unit]
2+
Description=Neuron SDK downgrade for inf1 instances
3+
Before=cloud-init.service ecs.service
4+
5+
[Service]
6+
Type=oneshot
7+
ExecStart=/var/lib/ecs/scripts/neuron-inf1-downgrade.sh
8+
RemainAfterExit=yes
9+
10+
[Install]
11+
WantedBy=multi-user.target
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#!/bin/bash
2+
# Abort on the first unhandled command failure. Note that errexit is suspended
# inside functions invoked from an `if` condition (as main does below).
set -e
3+
4+
# Neuron inf1 downgrade script
5+
# Detects inf1 instances and downgrades neuron driver to compatible version
6+
7+
# Directory that downgrade_neuron_packages searches for the cached *.rpm files
# holding the inf1-compatible package versions.
CACHE_DIR="/opt/ecs/neuron/inf1-rpms"
8+
9+
# Log a message to stderr (systemd provides timestamps)
# Args: message to log; all arguments are joined with single spaces
# Outputs: the message followed by a newline on stderr, nothing on stdout
log() {
    # printf instead of echo: echo would interpret a message such as "-n" or
    # "-e" as an option and print nothing.
    printf '%s\n' "$*" >&2
}
14+
15+
# Detect inf1 hardware using PCI device IDs
# inf1 instances have Neuron devices with IDs: 0x7064, 0x7065, 0x7066, or 0x7067
# (vendor 0x1d0f). The IDs are taken from `lspci -n`; if lspci is not installed
# or fails, they are read from /sys/bus/pci/devices instead so that a missing
# tool is not mistaken for "no inf1 device".
# Returns: 0 if inf1 detected, 1 if not inf1
detect_inf1_hardware() {
    log "Detecting inf1 hardware via PCI devices"

    # vendor:device pairs (lower-case hex, no 0x prefix) that identify inf1.
    local -r inf1_id_regex='1d0f:(7064|7065|7066|7067)'
    local pci_ids=""

    if ! { command -v lspci >/dev/null 2>&1 && pci_ids=$(lspci -n); }; then
        log "WARNING: lspci unavailable or failed, reading PCI IDs from sysfs"
        pci_ids=""
        local dev_dir vendor_id device_id
        for dev_dir in /sys/bus/pci/devices/*; do
            # Also skips the literal pattern when the glob matches nothing.
            [[ -r "$dev_dir/vendor" && -r "$dev_dir/device" ]] || continue
            vendor_id=$(<"$dev_dir/vendor")
            device_id=$(<"$dev_dir/device")
            # sysfs reports e.g. 0x1d0f / 0x7064; normalise to lspci's form.
            pci_ids+="${vendor_id#0x}:${device_id#0x}"$'\n'
        done
    fi

    # Check for inf1 Neuron device IDs
    if grep -Eq "$inf1_id_regex" <<< "$pci_ids"; then
        log "inf1 Neuron device detected"
        return 0
    fi

    log "No inf1 Neuron devices found"
    return 1
}
30+
31+
# Downgrade neuron packages to inf1-compatible versions
# Uses cached RPM packages and locks ALL neuron package versions to prevent updates
#
# For every *.rpm found under $CACHE_DIR: query its package name and version,
# skip it when the installed version already matches, otherwise remove the
# installed package (if any) and install the cached one. Afterwards the known
# neuron packages are version-locked via `dnf versionlock`; a locking failure
# is only a warning.
#
# Globals: CACHE_DIR (read)
# Returns 0 on success, 1 on failure (no cached RPMs, or an install failed)
downgrade_neuron_packages() {
    log "Starting neuron package downgrade for inf1"

    # Find all cached RPM files. NUL-delimited so file names are never split,
    # and stored in an array so the processing loop does not read from stdin.
    local -a cached_rpms=()
    local rpm_file
    while IFS= read -r -d '' rpm_file; do
        cached_rpms+=("$rpm_file")
    done < <(find "$CACHE_DIR" -name "*.rpm" -print0 2>/dev/null)

    if (( ${#cached_rpms[@]} == 0 )); then
        log "ERROR: No cached inf1-compatible packages found in $CACHE_DIR"
        return 1
    fi

    log "Found cached packages:"
    for rpm_file in "${cached_rpms[@]}"; do
        log " ${rpm_file##*/}"
    done

    # Process each cached RPM
    local package_name current_version target_version installed
    for rpm_file in "${cached_rpms[@]}"; do
        # Read the package name from the RPM header, not from the file name
        package_name=$(rpm -qp --queryformat '%{NAME}' "$rpm_file" 2>/dev/null) \
            || package_name=""

        if [[ -z "$package_name" ]]; then
            log "WARNING: Could not determine package name for $rpm_file, skipping"
            continue
        fi

        log "Processing package: $package_name"

        # Check current version. `rpm -q` prints "package X is not installed"
        # on stdout when it fails, so the output is only trusted on success.
        if current_version=$(rpm -q "$package_name" --queryformat '%{VERSION}' 2>/dev/null); then
            installed=1
        else
            installed=0
            current_version="none"
        fi
        target_version=$(rpm -qp --queryformat '%{VERSION}' "$rpm_file" 2>/dev/null) \
            || target_version=""

        log "Current $package_name version: $current_version"
        log "Target $package_name version: $target_version"

        # Skip if already at target version
        if [[ "$current_version" == "$target_version" ]]; then
            log "$package_name already at target version, skipping"
            continue
        fi

        # Remove current package. stdin is detached so package scriptlets can
        # never consume input that belongs to this script.
        if (( installed )); then
            log "Removing current $package_name"
            if ! rpm -e --nodeps "$package_name" </dev/null; then
                log "WARNING: Failed to remove $package_name"
            fi
        else
            log "$package_name is not currently installed, nothing to remove"
        fi

        # Install inf1-compatible version
        log "Installing inf1-compatible $package_name"
        if rpm -i "$rpm_file" </dev/null; then
            log "$package_name downgrade successful"
        else
            log "ERROR: Failed to install inf1-compatible $package_name"
            return 1
        fi
    done

    # Lock all known neuron packages to prevent partial updates
    local all_neuron_packages=("aws-neuronx-dkms" "aws-neuronx-tools" "aws-neuronx-oci-hook")
    log "Locking all neuron packages: ${all_neuron_packages[*]}"
    if dnf --cacheonly versionlock add "${all_neuron_packages[@]}"; then
        log "Package version locking successful"
    else
        log "WARNING: Failed to lock some packages"
    fi

    log "Neuron package downgrade completed successfully"
}
107+
108+
# Entry point: runs the downgrade only when inf1 hardware is present.
# On any other hardware it logs that nothing needs doing and succeeds.
# Returns: 0 when nothing was needed or the downgrade succeeded,
#          1 when the downgrade failed
main() {
    log "Starting neuron inf1 downgrade service"

    if detect_inf1_hardware; then
        log "inf1 hardware detected, proceeding with downgrade"
        downgrade_neuron_packages || {
            log "ERROR: Neuron package downgrade failed"
            return 1
        }
        log "Neuron inf1 downgrade service completed"
    else
        # Not an inf1 instance: the installed driver is left untouched.
        log "Non-inf1 hardware detected, no action needed"
        log "Neuron inf1 downgrade service completed"
        return 0
    fi
}
128+
129+
main "$@"

scripts/enable-ecs-agent-inferentia-support.sh

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,37 @@ EOF
2323

2424
sudo mv /tmp/neuron.repo /etc/yum.repos.d/neuron.repo
2525

26+
# Install inf1 downgrade support for al2023neu (files copied to /tmp by packer)
27+
if [[ $AMI_TYPE == "al2023neu" ]]; then
28+
sudo dnf install -y 'dnf-command(versionlock)'
29+
30+
# Install downgrade script and systemd service for inf1 compatibility
31+
sudo mkdir -p /var/lib/ecs/scripts/
32+
sudo cp /tmp/neuron-inf1-downgrade.sh /var/lib/ecs/scripts/
33+
sudo chmod +x /var/lib/ecs/scripts/neuron-inf1-downgrade.sh
34+
sudo cp /tmp/neuron-inf1-downgrade.service /etc/systemd/system/
35+
sudo systemctl enable neuron-inf1-downgrade.service
36+
fi
37+
2638
# Install OS headers
2739
sudo yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r) -y
2840

2941
# Install Neuron Driver
3042
if [[ $AMI_TYPE == "al2inf" ]]; then
3143
# Pin the aws-neuronx-dkms package version to 2.17.17.0 only for al2inf, since the newest versions of the Neuron SDK are no longer supporting linux kernel 4.14
3244
sudo yum install -y aws-neuronx-dkms-2.17.17.0
33-
else
34-
# Pin the aws-neuron-dkms package version to 2.21* for al2kernel5dot10inf and al2023neu
45+
elif [[ $AMI_TYPE == "al2kernel5dot10inf" ]]; then
46+
# Pin the aws-neuronx-dkms package version to 2.21* for legacy al2kernel5dot10inf
3547
# Refer: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/announcements/neuron2.x/announce-eos-neuron-driver-support-inf1.html
3648
sudo yum install -y aws-neuronx-dkms-2.21.*
49+
else
50+
# For al2023neu and future AMI types: prepare inf1 downgrade packages and install latest
51+
sudo mkdir -p /opt/ecs/neuron/inf1-rpms
52+
sudo chmod 755 /opt/ecs/neuron/inf1-rpms
53+
cd /opt/ecs/neuron/inf1-rpms
54+
sudo dnf download aws-neuronx-dkms-2.21.*
55+
cd -
56+
sudo yum install -y aws-neuronx-dkms
3757
fi
3858
sudo yum install -y aws-neuronx-oci-hook-2.*
3959

0 commit comments

Comments
 (0)