diff --git a/NVIDIA_DRIVER_VERSION b/NVIDIA_DRIVER_VERSION
new file mode 100644
index 00000000..0b6e1ffc
--- /dev/null
+++ b/NVIDIA_DRIVER_VERSION
@@ -0,0 +1,15 @@
+# NVIDIA Driver Version Tracking
+# ------------------------------
+# IMPORTANT: This file is for INFORMATIONAL AND TRACKING PURPOSES ONLY.
+#
+# DO NOT EDIT THIS FILE MANUALLY. It is automatically updated by the
+# check-update.sh script. Manual changes will be overwritten.
+# Format: nvidia_driver_version_AMI_TYPE = "VERSION" (AMI_TYPE: al2 or al2023)
+#
+# This file tracks the latest NVIDIA driver versions detected for different
+# Amazon Linux AMIs. It does not affect the actual driver installations.
+# For driver installations or updates, please refer to the appropriate
+# documentation or automation scripts.
+
+nvidia_driver_version_al2 = "550.163.01"
+nvidia_driver_version_al2023 = "570.133.20"
diff --git a/scripts/check-update-security.sh b/scripts/check-update-security.sh
index 93c4124d..8b2724bf 100755
--- a/scripts/check-update-security.sh
+++ b/scripts/check-update-security.sh
@@ -7,7 +7,7 @@ usage() {
     echo " $0 AMI_PLATFORM"
     echo "Example:"
     echo " $0 al2_arm"
-    echo "AMI_PLATFORM Must be one of: al1, al2, al2_arm"
+    echo "AMI_PLATFORM Must be one of: al1, al2, al2_arm, al2_gpu, al2023_gpu"
 }
 
 error_msg() {
@@ -19,6 +19,8 @@ error_msg() {
 AL1_PATH="/aws/service/ecs/optimized-ami/amazon-linux/recommended"
 AL2_PATH="/aws/service/ecs/optimized-ami/amazon-linux-2/recommended"
 AL2_ARM_PATH="/aws/service/ecs/optimized-ami/amazon-linux-2/arm64/recommended"
+AL2_GPU_PATH="/aws/service/ecs/optimized-ami/amazon-linux-2/gpu/recommended"
+AL2023_GPU_PATH="/aws/service/ecs/optimized-ami/amazon-linux-2023/gpu/recommended"
 
 # Indicates that an update exists
 UPDATE_EXISTS_CODE="100"
@@ -67,6 +69,14 @@ case "$platform" in
     ami_path=$AL2_ARM_PATH
     instance_type="c6g.medium"
     ;;
+"al2_gpu")
+    ami_path=$AL2_GPU_PATH
+    instance_type="g4dn.xlarge"
+    ;;
+"al2023_gpu")
+    ami_path=$AL2023_GPU_PATH
+    instance_type="g4dn.xlarge"
+    ;;
 *)
     error_msg "Incorrect platform selection"
     usage
@@ -122,7 +132,20 @@ instance_id=$(aws ec2 run-instances \
     --user-data file://user_data.txt \
     --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value='$platform-check-update-security'}]' |
     jq -r '.Instances[0].InstanceId')
-command_params='commands=["yum check-update --security --sec-severity=critical --exclude=nvidia*,docker*,cuda*,containerd*,runc* -q"]'
+
+# check-update based on platform
+if [ "$platform" = "al2_gpu" ]; then
+    # The amzn2-nvidia repository does not provide updateinfo metadata (updateinfo.xml),
+    # which YUM relies on to classify updates as security-related. The --security flag
+    # would not detect updates without this metadata. Therefore, we check for all updates
+    # to nvidia-driver packages and handle them as potential security updates.
+    command_params='commands=["yum check-update nvidia-driver-latest-dkms -q"]'
+elif [ "$platform" = "al2023_gpu" ]; then
+    # Run check-update in a loop to ensure that the repo metadata is up to date
+    command_params='commands=["for i in {1..5}; do dnf clean expire-cache; dnf --refresh check-upgrade nvidia-driver-cuda -q; code=$?; if [ $code -eq 100 ]; then exit 100; fi; sleep 5; done; exit 0"]'
+else
+    command_params='commands=["yum check-update --security --sec-severity=critical --exclude=nvidia*,docker*,cuda*,containerd*,runc* -q"]'
+fi
 
 # Wait for instance status to reach ok, fail at timeout code
 aws ec2 wait instance-running --instance-ids $instance_id
@@ -170,7 +193,7 @@ command_status() {
         --query 'Status' \
         --output text
 }
-max_retries=20
+max_retries=25
 success=0
 for ((r = 0; r < max_retries; r++)); do
     sleep 5
@@ -187,17 +210,33 @@ if [ $success -ne 1 ]; then
 fi
 
 # Get command output
-cmd_response_code=$(aws ssm get-command-invocation \
+cmd_output=$(aws ssm get-command-invocation \
     --command-id $cmd_id \
-    --instance-id $instance_id |
-    jq -r '.ResponseCode')
+    --instance-id $instance_id)
+
+cmd_response_code=$(echo "$cmd_output" | jq -r '.ResponseCode')
+std_output=$(echo "$cmd_output" | jq -r '.StandardOutputContent')
 
 # Delete the instance
 terminate_out=$(aws ec2 terminate-instances --instance-ids $instance_id)
 
 # Return whether update is necessary
 if [ "$cmd_response_code" -eq "$UPDATE_EXISTS_CODE" ]; then
-    echo "true"
+    if [ "$platform" = "al2_gpu" ]; then
+        # The package line is expected to read: NAME.ARCH [EPOCH:]VERSION-RELEASE REPO.
+        # Keep VERSION only; the epoch prefix may be more than one digit long.
+        nvidia_driver_version=$(echo "$std_output" | grep "nvidia-driver-latest-dkms" | awk '{print $2}' | cut -d'-' -f1 | sed 's/^[0-9]*://')
+        if [ -n "$nvidia_driver_version" ]; then
+            echo "true $nvidia_driver_version"
+        else
+            echo "true"
+        fi
+    elif [ "$platform" = "al2023_gpu" ]; then
+        nvidia_driver_version=$(echo "$std_output" | grep "nvidia-driver-cuda" | awk '{print $2}' | cut -d'-' -f1 | sed 's/^[0-9]*://')
+        echo "true $nvidia_driver_version"
+    else
+        echo "true"
+    fi
 elif [ "$cmd_response_code" -ne "$SUCCESS_CODE" ]; then
     # If update doesn't exist and there was a fail code, something went wrong
     echo "Unknown issue with the command execution"
diff --git a/scripts/check-update.sh b/scripts/check-update.sh
index 48f02316..3e71293c 100755
--- a/scripts/check-update.sh
+++ b/scripts/check-update.sh
@@ -16,6 +16,43 @@ error() {
     exit 1
 }
 
+# Record the NVIDIA driver version reported by check-update-security.sh.
+#
+# Arguments:
+#   $1 - AMI variant; only "al2" and "al2023" are tracked, others are ignored
+#   $2 - output of check-update-security.sh for the GPU platform, expected to
+#        be "true VERSION" when a driver update was found
+#
+# Rewrites the existing nvidia_driver_version_VARIANT entry in the
+# NVIDIA_DRIVER_VERSION file of the current directory. Does nothing when no
+# version was reported, the file is missing, or the entry is absent.
+handle_nvidia_version() {
+    local ami_variant=$1
+    local gpu_update=$2
+
+    # Skip if not a GPU-supported AMI type
+    if [[ $ami_variant != "al2" && $ami_variant != "al2023" ]]; then
+        return
+    fi
+
+    local version=""
+    local version_key="nvidia_driver_version_${ami_variant}"
+
+    if [[ $gpu_update == true* ]]; then
+        # Take the second whitespace-separated field of the first line. A bare
+        # "true" (no version reported) must yield an empty string; cut -f2
+        # would echo the whole line back and store "true" as the version.
+        version=$(echo "$gpu_update" | awk 'NR == 1 {print $2}')
+    fi
+
+    # Update version entry if version is available and file exists
+    if [ -n "$version" ] && [ -f NVIDIA_DRIVER_VERSION ]; then
+        if grep -q "^${version_key} = " NVIDIA_DRIVER_VERSION; then
+            sed -i "s/^${version_key} = .*/${version_key} = \"${version}\"/" NVIDIA_DRIVER_VERSION
+        fi
+    fi
+}
+
 readonly ami_type="$1"
 if [ -z "$ami_type" ]; then
     error "AMI_TYPE must be provided"
@@ -26,15 +63,44 @@ cp release-$ami_type.auto.pkrvars.hcl release-$ami_type.old.hcl
 set +e
 diff_val=$(diff <(grep -v ami_version release-$ami_type.old.hcl) <(grep -v ami_version release-$ami_type.auto.pkrvars.hcl))
 set -e
+
+# Start from "no update". This must happen before the GPU check below so a
+# driver update found there is not reset when the dependency diff is empty.
+Update="false"
+
+# Check for NVIDIA driver version for both AL2 and AL2023
+if [ "$ami_type" = "al2" ] || [ "$ami_type" = "al2023" ]; then
+    gpu_update=$(./scripts/check-update-security.sh "${ami_type}_gpu")
+    handle_nvidia_version "$ami_type" "$gpu_update"
+    if [[ $gpu_update == true* ]]; then
+        Update="true"
+    fi
+fi
+
 # If no difference in dependencies, check for security update
 if [ -z "$diff_val" ]; then
-    # al2023 version already generates a diff in dependency file if it has security updates, so no check necessary if al2023
-    if [ "$ami_type" != "al2023" ]; then
+    case "$ami_type" in
+    "al2023")
+        # AL2023 version already generates a diff in dependency file if it has security updates, so no check necessary if AL2023
+        ;;
+    "al1")
         Update=$(./scripts/check-update-security.sh $ami_type)
-        if [ "$Update" != "true" ] && [ "$ami_type" != "al1" ]; then
-            Update=$(./scripts/check-update-security.sh "$ami_type"_arm)
+        ;;
+    "al2")
+        # Check all AL2 variants
+        amd_update=$(./scripts/check-update-security.sh $ami_type)
+        arm_update=$(./scripts/check-update-security.sh "${ami_type}_arm")
+
+        # Combine results
+        if [[ $amd_update == true* ]] || [[ $arm_update == true* ]]; then
+            Update="true"
         fi
-    fi
+        ;;
+    *)
+        echo "Error: Invalid AMI type: $ami_type"
+        exit 1
+        ;;
+    esac
 else
     Update="true"
 fi
@@ -44,6 +110,10 @@ rm "release-$ami_type.old.hcl"
 if [ "$Update" = "true" ]; then
     echo "Update exists for $ami_type"
     git add release-$ami_type.auto.pkrvars.hcl
+    if [ -f NVIDIA_DRIVER_VERSION ] && ! git diff --quiet NVIDIA_DRIVER_VERSION; then
+        echo "NVIDIA driver version changes detected"
+        git add NVIDIA_DRIVER_VERSION
+    fi
 else
     echo "Update does not exist for $ami_type"
 fi