|
#!/bin/bash
set -e

# Neuron inf1 downgrade script
# Detects inf1 instances and downgrades neuron driver to compatible version

# Directory searched for the cached inf1-compatible RPM files.
# Marked readonly so no later code can repoint the cache by accident.
readonly CACHE_DIR="/opt/ecs/neuron/inf1-rpms"
| 8 | + |
# Log a message to stderr (systemd provides timestamps)
# Args: message to log; multiple arguments are joined with single spaces
# Outputs: the message followed by a newline on stderr, nothing on stdout
log() {
  # printf instead of echo: echo would treat a message such as "-n" or "-e"
  # as an option and silently drop or reinterpret it.
  printf '%s\n' "$*" >&2
}
| 14 | + |
# Detect inf1 hardware using PCI device IDs
# inf1 instances have Neuron devices with IDs: 0x7064, 0x7065, 0x7066, or 0x7067
# (matched together with vendor ID 1d0f).
#
# Primary source is `lspci -n`. If lspci is missing or exits non-zero, the
# function falls back to reading the vendor/device files under the sysfs PCI
# directory, so that a missing tool is not mistaken for "not inf1".
#
# Args: $1 - (optional) sysfs PCI devices directory used by the fallback;
#            defaults to /sys/bus/pci/devices
# Returns: 0 if inf1 detected, 1 if not inf1
detect_inf1_hardware() {
  local sysfs_pci_dir="${1:-/sys/bus/pci/devices}"
  local pci_listing dev_dir vendor_id device_id

  log "Detecting inf1 hardware via PCI devices"

  if pci_listing=$(lspci -n 2>/dev/null); then
    # Check for inf1 Neuron device IDs in the numeric vendor:device column
    if grep -Eq '1d0f:(7064|7065|7066|7067)' <<< "$pci_listing"; then
      log "inf1 Neuron device detected"
      return 0
    fi
  else
    log "WARNING: lspci unavailable or failed, checking $sysfs_pci_dir instead"
    for dev_dir in "$sysfs_pci_dir"/*; do
      # Also skips the literal pattern left behind by an unmatched glob
      [[ -r "$dev_dir/vendor" && -r "$dev_dir/device" ]] || continue
      vendor_id=$(< "$dev_dir/vendor")
      device_id=$(< "$dev_dir/device")
      if [[ "$vendor_id" == "0x1d0f" && "$device_id" =~ ^0x(7064|7065|7066|7067)$ ]]; then
        log "inf1 Neuron device detected"
        return 0
      fi
    done
  fi

  log "No inf1 Neuron devices found"
  return 1
}
| 30 | + |
# Downgrade neuron packages to inf1-compatible versions
# Uses cached RPM packages and locks ALL neuron package versions to prevent updates
#
# For every *.rpm found under CACHE_DIR the package name and version are read
# from the file; if the installed version differs (or the package is absent)
# the installed package is removed and the cached one installed. Afterwards the
# known neuron packages are version-locked (lock failure is only a warning).
#
# Globals: CACHE_DIR (read)
# Returns 0 on success, 1 on failure (no cached RPMs, or an install failed)
downgrade_neuron_packages() {
  log "Starting neuron package downgrade for inf1"

  # Find all cached RPM files (NUL-delimited so unusual file names survive)
  local -a cached_rpms=()
  local rpm_file
  while IFS= read -r -d '' rpm_file; do
    cached_rpms+=("$rpm_file")
  done < <(find "$CACHE_DIR" -name "*.rpm" -print0 2>/dev/null)

  if (( ${#cached_rpms[@]} == 0 )); then
    log "ERROR: No cached inf1-compatible packages found in $CACHE_DIR"
    return 1
  fi

  log "Found cached packages:"
  for rpm_file in "${cached_rpms[@]}"; do
    log " ${rpm_file##*/}"
  done

  # Process each cached RPM
  local package_name current_version target_version is_installed
  for rpm_file in "${cached_rpms[@]}"; do
    # Read the package name from the RPM file's metadata
    package_name=$(rpm -qp --queryformat '%{NAME}' "$rpm_file" 2>/dev/null) \
      || package_name=""
    if [[ -z "$package_name" ]]; then
      log "WARNING: Could not determine package name for $rpm_file, skipping"
      continue
    fi

    log "Processing package: $package_name"

    target_version=$(rpm -qp --queryformat '%{VERSION}' "$rpm_file" 2>/dev/null) \
      || target_version=""
    if [[ -z "$target_version" ]]; then
      log "WARNING: Could not determine target version for $rpm_file, skipping"
      continue
    fi

    # Check current version. rpm -q prints "package ... is not installed" on
    # stdout when it fails, so use its exit status instead of its output to
    # decide whether the package is present.
    if current_version=$(rpm -q --queryformat '%{VERSION}' "$package_name" 2>/dev/null); then
      is_installed=1
    else
      is_installed=0
      current_version="none"
    fi

    log "Current $package_name version: $current_version"
    log "Target $package_name version: $target_version"

    # Skip if already at target version
    if [[ "$current_version" == "$target_version" ]]; then
      log "$package_name already at target version, skipping"
      continue
    fi

    # Remove current package (only when there is one to remove).
    # --nodeps: erase without dependency checks.
    if (( is_installed )); then
      log "Removing current $package_name"
      if ! rpm -e --nodeps "$package_name" 2>/dev/null; then
        log "WARNING: Failed to remove $package_name"
      fi
    fi

    # Install inf1-compatible version
    log "Installing inf1-compatible $package_name"
    if rpm -i "$rpm_file"; then
      log "$package_name downgrade successful"
    else
      log "ERROR: Failed to install inf1-compatible $package_name"
      return 1
    fi
  done

  # Lock all known neuron packages to prevent partial updates.
  # Best effort: a failed lock is logged but does not fail the downgrade.
  local -a all_neuron_packages=("aws-neuronx-dkms" "aws-neuronx-tools" "aws-neuronx-oci-hook")
  log "Locking all neuron packages: ${all_neuron_packages[*]}"
  if dnf --cacheonly versionlock add "${all_neuron_packages[@]}"; then
    log "Package version locking successful"
  else
    log "WARNING: Failed to lock some packages"
  fi

  log "Neuron package downgrade completed successfully"
  return 0
}
| 107 | + |
# Entry point: runs hardware detection and, only on inf1, the package downgrade.
# On non-inf1 hardware nothing is changed and the run counts as successful.
# Exit code: 0 on success, 1 on failure
main() {
  log "Starting neuron inf1 downgrade service"

  if detect_inf1_hardware; then
    log "inf1 hardware detected, proceeding with downgrade"
    downgrade_neuron_packages || {
      log "ERROR: Neuron package downgrade failed"
      return 1
    }
    log "Neuron inf1 downgrade service completed"
  else
    log "Non-inf1 hardware detected, no action needed"
    log "Neuron inf1 downgrade service completed"
    return 0
  fi
}
| 128 | + |
| 129 | +main "$@" |
0 commit comments