Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions NVIDA_DRIVER_VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# NVIDIA Driver Version Tracking
# ------------------------------
# IMPORTANT: This file is for INFORMATIONAL AND TRACKING PURPOSES ONLY.
#
# DO NOT EDIT THIS FILE MANUALLY. It is automatically updated by the
# check-update.sh script. Manual changes will be overwritten.
# Format: nvidia_driver_version_<ami_type> = "<version>"
#
# This file tracks the latest NVIDIA driver versions detected for different
# Amazon Linux AMIs. It does not affect the actual driver installations.
# For driver installations or updates, please refer to the appropriate
# documentation or automation scripts.

nvidia_driver_version_al2 = "550.163.01"
Comment thread
harishxr marked this conversation as resolved.
nvidia_driver_version_al2023 = "570.133.20"
50 changes: 43 additions & 7 deletions scripts/check-update-security.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ usage() {
echo " $0 AMI_PLATFORM"
echo "Example:"
echo " $0 al2_arm"
echo "AMI_PLATFORM Must be one of: al1, al2, al2_arm"
echo "AMI_PLATFORM Must be one of: al1, al2, al2_arm, al2_gpu, al2023_gpu"
}

error_msg() {
Expand All @@ -19,6 +19,8 @@ error_msg() {
AL1_PATH="/aws/service/ecs/optimized-ami/amazon-linux/recommended"
AL2_PATH="/aws/service/ecs/optimized-ami/amazon-linux-2/recommended"
AL2_ARM_PATH="/aws/service/ecs/optimized-ami/amazon-linux-2/arm64/recommended"
AL2_GPU_PATH="/aws/service/ecs/optimized-ami/amazon-linux-2/gpu/recommended"
AL2023_GPU_PATH="/aws/service/ecs/optimized-ami/amazon-linux-2023/gpu/recommended"

# Indicates that an update exists
UPDATE_EXISTS_CODE="100"
Expand Down Expand Up @@ -67,6 +69,13 @@ case "$platform" in
ami_path=$AL2_ARM_PATH
instance_type="c6g.medium"
;;
"al2_gpu")
ami_path=$AL2_GPU_PATH
;;
"al2023_gpu")
ami_path=$AL2023_GPU_PATH
instance_type="g4dn.xlarge"
;;
*)
error_msg "Incorrect platform selection"
usage
Expand Down Expand Up @@ -122,7 +131,20 @@ instance_id=$(aws ec2 run-instances \
--user-data file://user_data.txt \
--tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value='$platform-check-update-security'}]' |
jq -r '.Instances[0].InstanceId')
command_params='commands=["yum check-update --security --sec-severity=critical --exclude=nvidia*,docker*,cuda*,containerd*,runc* -q"]'

# check-update based on platform
if [ "$platform" = "al2_gpu" ]; then
# The amzn2-nvidia repository does not provide updateinfo metadata (updateinfo.xml),
# which YUM relies on to classify updates as security-related. The --security flag
# would not detect updates without this metadata. Therefore, we check for all updates
# to nvidia-driver packages and handle them as potential security updates.
command_params='commands=["yum check-update nvidia-driver-latest-dkms -q"]'
Comment thread
singholt marked this conversation as resolved.
elif [ "$platform" = "al2023_gpu" ]; then
# Run check-update in a loop to ensure that the repo metadata is up to date
command_params='commands=["for i in {1..5}; do dnf clean expire-cache; dnf --refresh check-upgrade nvidia-driver-cuda -q; code=$?; if [ $code -eq 100 ]; then exit 100; fi; sleep 5; done; exit 0"]'
else
command_params='commands=["yum check-update --security --sec-severity=critical --exclude=nvidia*,docker*,cuda*,containerd*,runc* -q"]'
fi

# Wait for instance status to reach ok, fail at timeout code
aws ec2 wait instance-running --instance-ids $instance_id
Expand Down Expand Up @@ -170,7 +192,7 @@ command_status() {
--query 'Status' \
--output text
}
max_retries=20
max_retries=25
success=0
for ((r = 0; r < max_retries; r++)); do
sleep 5
Expand All @@ -187,17 +209,31 @@ if [ $success -ne 1 ]; then
fi

# Get command output
cmd_response_code=$(aws ssm get-command-invocation \
cmd_output=$(aws ssm get-command-invocation \
--command-id $cmd_id \
--instance-id $instance_id |
jq -r '.ResponseCode')
--instance-id $instance_id)

cmd_response_code=$(echo "$cmd_output" | jq -r '.ResponseCode')
std_output=$(echo "$cmd_output" | jq -r '.StandardOutputContent')

# Delete the instance
terminate_out=$(aws ec2 terminate-instances --instance-ids $instance_id)

# Return whether update is necessary
if [ "$cmd_response_code" -eq "$UPDATE_EXISTS_CODE" ]; then
echo "true"
if [ "$platform" = "al2_gpu" ]; then
nvidia_driver_version=$(echo "$std_output" | grep "nvidia-driver-latest-dkms" | awk '{print $2}' | cut -d'-' -f1 | sed 's/^[0-9]://')
if [ -n "$nvidia_driver_version" ]; then
echo "true $nvidia_driver_version"
Comment thread
ShelbyZ marked this conversation as resolved.
else
echo "true"
fi
elif [ "$platform" = "al2023_gpu" ]; then
nvidia_driver_version=$(echo "$std_output" | grep "nvidia-driver-cuda" | awk '{print $2}' | cut -d'-' -f1 | sed 's/^[0-9]://')
echo "true $nvidia_driver_version"
else
echo "true"
fi
elif [ "$cmd_response_code" -ne "$SUCCESS_CODE" ]; then
# If update doesn't exist and there was a fail code, something went wrong
echo "Unknown issue with the command execution"
Expand Down
65 changes: 60 additions & 5 deletions scripts/check-update.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,31 @@ error() {
exit 1
}

#######################################
# Record the NVIDIA driver version reported by a GPU update check.
#
# Rewrites the "nvidia_driver_version_<ami_variant>" entry of the
# NVIDIA_DRIVER_VERSION file in the current directory. Nothing is changed when
# the variant is not al2/al2023, when the check result carries no usable
# version, when the file is missing, or when the file has no entry for the
# variant.
#
# Arguments:
#   $1 - AMI variant, e.g. "al2" or "al2023"
#   $2 - result of the GPU update check; "true <version>" when an update with
#        a known driver version exists, a bare "true" when the version could
#        not be determined
# Outputs:
#   A warning on stderr when the reported version is malformed.
# Returns:
#   0 in every no-op case, otherwise the exit status of sed.
#######################################
handle_nvidia_version() {
    local ami_variant=$1
    local gpu_update=$2

    # Skip if not a GPU-supported AMI type
    if [[ $ami_variant != "al2" && $ami_variant != "al2023" ]]; then
        return 0
    fi

    # No update reported, so there is no version to record.
    if [[ $gpu_update != true* ]]; then
        return 0
    fi

    local version_key="nvidia_driver_version_${ami_variant}"
    local version=""

    # Take the second space-separated word of the first line. A bare "true"
    # leaves the version empty; `cut -d' ' -f2` would instead echo the whole
    # line back and "true" would be stored as the driver version.
    IFS=' ' read -r _ version _ <<<"$gpu_update"

    if [[ -z $version ]]; then
        return 0
    fi

    # The value is spliced into a sed replacement below, so accept only
    # version-like strings (leading digit, no sed metacharacters).
    local version_pattern='^[0-9][0-9A-Za-z._~+-]*$'
    if [[ ! $version =~ $version_pattern ]]; then
        echo "Ignoring malformed NVIDIA driver version: ${version}" >&2
        return 0
    fi

    # Update version entry only if the file exists and already tracks this key
    if [[ -f NVIDIA_DRIVER_VERSION ]] &&
        grep -q "^${version_key} = " NVIDIA_DRIVER_VERSION; then
        sed -i "s/^${version_key} = .*/${version_key} = \"${version}\"/" NVIDIA_DRIVER_VERSION
    fi
}

readonly ami_type="$1"
if [ -z "$ami_type" ]; then
error "AMI_TYPE must be provided"
Expand All @@ -26,15 +51,41 @@ cp release-$ami_type.auto.pkrvars.hcl release-$ami_type.old.hcl
set +e
diff_val=$(diff <(grep -v ami_version release-$ami_type.old.hcl) <(grep -v ami_version release-$ami_type.auto.pkrvars.hcl))
set -e

# Check for NVIDIA driver version for both AL2 and AL2023
if [ "$ami_type" = "al2" ] || [ "$ami_type" = "al2023" ]; then
gpu_update=$(./scripts/check-update-security.sh "${ami_type}_gpu")
handle_nvidia_version "$ami_type" "$gpu_update"
if [[ $gpu_update == true* ]]; then
Update="true"
fi
fi

# If no difference in dependencies, check for security update
if [ -z "$diff_val" ]; then
# al2023 version already generates a diff in dependency file if it has security updates, so no check necessary if al2023
if [ "$ami_type" != "al2023" ]; then
Update="false"
case "$ami_type" in
"al2023")
# AL2023 version already generates a diff in dependency file if it has security updates, so no check necessary if AL2023
;;
"al1")
Update=$(./scripts/check-update-security.sh $ami_type)
if [ "$Update" != "true" ] && [ "$ami_type" != "al1" ]; then
Update=$(./scripts/check-update-security.sh "$ami_type"_arm)
;;
"al2")
# Check all AL2 variants
amd_update=$(./scripts/check-update-security.sh $ami_type)
arm_update=$(./scripts/check-update-security.sh "${ami_type}_arm")

# Combine results
if [[ $amd_update == true* ]] || [[ $arm_update == true* ]]; then
Update="true"
fi
fi
;;
*)
echo "Error: Invalid AMI type: $ami_type"
exit 1
;;
esac
else
Update="true"
fi
Expand All @@ -44,6 +95,10 @@ rm "release-$ami_type.old.hcl"
if [ "$Update" = "true" ]; then
echo "Update exists for $ami_type"
git add release-$ami_type.auto.pkrvars.hcl
if [ -f NVIDIA_DRIVER_VERSION ] && ! git diff --quiet NVIDIA_DRIVER_VERSION; then
echo "NVIDIA driver version changes detected"
git add NVIDIA_DRIVER_VERSION
fi
else
echo "Update does not exist for $ami_type"
fi