Commit 856640b: pin nvidia major ver
Parent: 55e6bb8

6 files changed: 100 additions & 57 deletions

NVIDIA_DRIVER_VERSION

Lines changed: 6 additions & 7 deletions
@@ -1,16 +1,15 @@
 # NVIDIA Driver and CUDA Version Tracking
 # ----------------------------------------
-# IMPORTANT: This file is for INFORMATIONAL AND TRACKING PURPOSES ONLY.
+# IMPORTANT: DO NOT EDIT THIS FILE MANUALLY. It is automatically updated
+# by the check-update.sh script. Manual changes will be overwritten.
 #
-# DO NOT EDIT THIS FILE MANUALLY. It is automatically updated by the
-# check-update.sh script. Manual changes will be overwritten.
 # Format: nvidia_driver_version_<ami_type> = "<version>"
 #         cuda_version_<ami_type> = "<version>" (AL2 only)
 #
-# This file tracks the latest NVIDIA driver and CUDA versions detected for different
-# Amazon Linux AMIs. It does not affect the actual driver installations.
-# For driver installations or updates, please refer to the appropriate
-# documentation or automation scripts.
+# This file tracks the latest NVIDIA driver and CUDA versions detected for
+# different Amazon Linux AMIs. For AL2023 GPU, the version in this file is
+# used by the install script during AMI builds (sourced via Packer file
+# provisioner). The pinned major version is defined in variables.pkr.hcl.
 
 nvidia_driver_version_al2 = "550.163.01"
 nvidia_driver_version_al2023 = "580.126.09"
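
Note: the key = "value" lines above are parsed with grep and awk during the AMI build (see install-nvidia-driver.sh below). A minimal standalone sketch of that parse, where get_tracked_version is an illustrative helper name, not something in the repo:

    # Extract the quoted version string for a given key from the tracking file.
    get_tracked_version() {
      local key="$1" file="$2"
      grep "^${key}" "$file" | awk -F'"' '{print $2}'
    }

    get_tracked_version nvidia_driver_version_al2023 NVIDIA_DRIVER_VERSION
    # prints: 580.126.09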

al2023.pkr.hcl

Lines changed: 6 additions & 0 deletions
@@ -200,6 +200,12 @@ build {
     only = ["amazon-ebs.al2023gpu"]
   }
 
+  provisioner "file" {
+    source      = "NVIDIA_DRIVER_VERSION"
+    destination = "/tmp/NVIDIA_DRIVER_VERSION"
+    only        = ["amazon-ebs.al2023gpu"]
+  }
+
   provisioner "shell" {
     environment_vars = [
       "AMI_TYPE=${source.name}",

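The file provisioner is declared before the shell provisioner, so Packer uploads the tracking file to /tmp before install-nvidia-driver.sh runs, and the install script exits if the read fails. To make the build fail even earlier on a missing upload, a guard along these lines could be added (a sketch, not part of this commit):

    provisioner "shell" {
      inline = ["test -s /tmp/NVIDIA_DRIVER_VERSION || { echo 'NVIDIA_DRIVER_VERSION was not uploaded'; exit 1; }"]
      only   = ["amazon-ebs.al2023gpu"]
    }
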
scripts/al2023/gpu/install-nvidia-driver.sh

Lines changed: 11 additions & 34 deletions
@@ -47,8 +47,17 @@ fi
 # open, and GRID) in /var/lib/dkms-archive. All three must be on the same driver
 # version to ensure proper functionality. The GRID kmod comes from the EC2 GRID
 # .run file in S3, while proprietary and open come from the AL2023 nvidia repo.
-# If the repo and S3 versions differ, we use the lower of the two to ensure both
-# sources can provide it.
+#
+# The exact driver version is determined by the security check script
+# (check-update-security.sh) as min(repo, S3 GRID) within the pinned major,
+# and tracked in the NVIDIA_DRIVER_VERSION file uploaded by Packer.
+
+NVIDIA_DRIVER_FULL_VERSION=$(grep "^nvidia_driver_version_al2023" /tmp/NVIDIA_DRIVER_VERSION | awk -F'"' '{print $2}')
+if [[ -z "$NVIDIA_DRIVER_FULL_VERSION" ]]; then
+  echo "ERROR: Could not read nvidia_driver_version_al2023 from /tmp/NVIDIA_DRIVER_VERSION"
+  exit 1
+fi
+echo "Using NVIDIA driver version: ${NVIDIA_DRIVER_FULL_VERSION}"
 
 # Some regions do not have access to the GRID driver S3 bucket, skip GRID driver
 skip_grid_driver=""
@@ -59,38 +68,6 @@ fi
 
 EC2_GRID_DRIVER_S3_BUCKET="ec2-linux-nvidia-drivers"
 
-if [[ -z "$skip_grid_driver" ]]; then
-  LATEST_GRID_DRIVER_VERSION=$(aws s3 ls --recursive s3://${EC2_GRID_DRIVER_S3_BUCKET}/ --no-sign-request \
-    | grep -Eo "(NVIDIA-Linux-x86_64-)[0-9]+\.[0-9]+\.[0-9]+(-grid-aws\.run)" \
-    | cut -d'-' -f4 \
-    | sort -V \
-    | tail -1)
-
-  if [[ -z "$LATEST_GRID_DRIVER_VERSION" ]]; then
-    echo "ERROR: Could not determine NVIDIA GRID driver version from S3"
-    exit 1
-  fi
-  echo "Latest GRID .run version in S3: ${LATEST_GRID_DRIVER_VERSION}"
-fi
-
-LATEST_OPEN_MODULE_VERSION=$(dnf repoquery --latest=1 --arch=noarch --queryformat "%{version}" "kmod-nvidia-open-dkms" 2>/dev/null | sort -V | tail -1)
-
-if [[ -z "$LATEST_OPEN_MODULE_VERSION" ]]; then
-  echo "ERROR: Could not determine NVIDIA open module version from repo"
-  exit 1
-fi
-echo "Latest open kmod version in repo: ${LATEST_OPEN_MODULE_VERSION}"
-
-if [[ -n "$skip_grid_driver" ]]; then
-  # No GRID driver available in this region, use open module version directly
-  NVIDIA_DRIVER_FULL_VERSION="$LATEST_OPEN_MODULE_VERSION"
-else
-  # Use the lower version to ensure both sources can provide it
-  NVIDIA_DRIVER_FULL_VERSION=$(printf '%s\n%s\n' "$LATEST_GRID_DRIVER_VERSION" "$LATEST_OPEN_MODULE_VERSION" | sort -V | head -1)
-fi
-
-echo "Selected NVIDIA driver version: ${NVIDIA_DRIVER_FULL_VERSION}"
-
 ### Kernel Module Archive Functions ###
 # These functions pre-compile and archive different NVIDIA driver variants
 # This allows runtime switching between proprietary, open-source, and GRID drivers
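
The min(repo, S3 GRID) selection that this commit moves out of the install script and into check-update-security.sh hinges on GNU sort's version ordering. A standalone sketch of the semantics, using illustrative version strings:

    # Pick the lower of two dotted versions with sort -V, so that both the
    # repo and the GRID .run file can provide the chosen driver version.
    min_version() {
      printf '%s\n%s\n' "$1" "$2" | sort -V | head -1
    }

    min_version "580.126.09" "580.95.05"   # prints 580.95.05 (95 < 126)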

scripts/al2023/gpu/nvidia-kmod-load.sh

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ readonly NVIDIA_GRID_SUBDEVICES=(
 )
 readonly NVIDIA_PROPRIETARY_SUBDEVICES=(
   "1db1:1212" # P3 instances
+  "1db5:1249" # P3dn instances
   "13f2:113a" # G3 instances
 )
 
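The entries appear to be PCI device:subsystem-device ID pairs for NVIDIA (vendor 10de) GPUs. On a running instance they can be checked against the hardware with lspci; a sketch (exact output varies by instance type):

    # List NVIDIA devices with numeric IDs; -v adds the Subsystem line whose
    # device part should match one of the entries in the arrays above.
    lspci -d 10de: -nnv | grep -Ei '3d controller|vga|subsystem'
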
scripts/check-update-security.sh

Lines changed: 70 additions & 16 deletions
@@ -139,13 +139,29 @@ instance_id=$(aws ec2 run-instances \
   --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value='$platform-check-update-security'}]' |
   jq -r '.Instances[0].InstanceId')
 
+# Read pinned major version for AL2023 GPU filtering
+pinned_major=""
+if [ "$platform" = "al2023_gpu" ]; then
+  pinned_major=$(sed -n '/variable "nvidia_driver_major_al2023" {/,/}/p' variables.pkr.hcl | grep "default" | awk -F '"' '{ print $2 }')
+  if [ -z "$pinned_major" ]; then
+    echo "ERROR: Could not read nvidia_driver_major_al2023 from variables.pkr.hcl"
+    exit 1
+  fi
+fi
+
 # check-update based on platform
 if [[ $platform == al2023* ]]; then
   check_upgrade_options="--sec-severity Critical --exclude=$EXCLUDE_SEC_UPDATES_PKGS"
   if [[ $platform == *gpu ]]; then
-    check_upgrade_options="nvidia-driver-cuda"
+    # dnf check-upgrade only reports the single latest version across all majors,
+    # so it can't detect updates within a pinned major. Instead, query the installed
+    # version and the latest available within the pinned major, then compare locally.
+    gpu_cmd_installed="dnf repoquery --installed --arch=x86_64 --queryformat '%{version}' nvidia-driver-cuda"
+    gpu_cmd_latest="dnf repoquery --disableplugin=versionlock --arch=x86_64 --queryformat '%{version}' nvidia-driver-cuda | grep '^${pinned_major}[.]' | sort -V | tail -1"
+    command_params="commands=[\"echo INSTALLED=\$(${gpu_cmd_installed})\",\"echo LATEST=\$(${gpu_cmd_latest})\"]"
+  else
+    command_params="commands=[\"dnf --refresh check-upgrade --releasever=latest --disableplugin=versionlock $check_upgrade_options -q\"]"
   fi
-  command_params="commands=[\"dnf --refresh check-upgrade --releasever=latest --disableplugin=versionlock $check_upgrade_options -q\"]"
 elif [ "$platform" = "al2_gpu" ]; then
   # The amzn2-nvidia repository does not provide updateinfo metadata (updateinfo.xml),
   # which YUM relies on to classify updates as security-related. The --security flag
@@ -229,6 +245,55 @@ std_output=$(echo "$cmd_output" | jq -r '.StandardOutputContent')
 # Delete the instance
 terminate_out=$(aws ec2 terminate-instances --instance-ids $instance_id)
 
+# AL2023 GPU uses repoquery instead of check-upgrade, handle separately
+if [ "$platform" = "al2023_gpu" ]; then
+  if [ "$cmd_response_code" -ne "$SUCCESS_CODE" ]; then
+    echo "Unknown issue with the command execution"
+    exit 1
+  fi
+
+  installed_version=$(echo "$std_output" | grep "^INSTALLED=" | cut -d'=' -f2)
+  latest_repo_version=$(echo "$std_output" | grep "^LATEST=" | cut -d'=' -f2)
+
+  if [ -z "$installed_version" ] || [ -z "$latest_repo_version" ]; then
+    echo "ERROR: Could not determine installed or latest NVIDIA driver version"
+    exit 1
+  fi
+
+  # Compare installed vs latest within pinned major
+  newer=$(printf '%s\n%s' "$installed_version" "$latest_repo_version" | sort -V | tail -1)
+  if [ "$newer" = "$installed_version" ]; then
+    echo "false"
+    exit 0
+  fi
+
+  # The AMI build installs min(repo, S3 GRID .run), so check the GRID bucket
+  # to determine the actual version that would be installed.
+  grid_driver_version=$(aws s3 ls --recursive s3://ec2-linux-nvidia-drivers/ --no-sign-request |
+    grep -Eo "(NVIDIA-Linux-x86_64-)[0-9]+\.[0-9]+\.[0-9]+(-grid-aws\.run)" |
+    cut -d'-' -f4 |
+    grep "^${pinned_major}\." |
+    sort -V |
+    tail -1)
+  if [ -z "$grid_driver_version" ]; then
+    echo "ERROR: Could not determine NVIDIA GRID driver version from S3 for major ${pinned_major}"
+    exit 1
+  fi
+
+  # Use min(repo, GRID) as the effective version, same as the install script
+  effective_version=$(printf '%s\n%s' "$latest_repo_version" "$grid_driver_version" | sort -V | head -1)
+
+  # Only trigger a release if the effective version is newer than installed
+  newer=$(printf '%s\n%s' "$installed_version" "$effective_version" | sort -V | tail -1)
+  if [ "$newer" = "$installed_version" ]; then
+    echo "false"
+    exit 0
+  fi
+
+  echo "true $effective_version"
+  exit 0
+fi
+
 # Return whether update is necessary
 if [ "$cmd_response_code" -eq "$UPDATE_EXISTS_CODE" ]; then
   if [ "$platform" = "al2_gpu" ]; then
@@ -259,20 +324,9 @@ if [ "$cmd_response_code" -eq "$UPDATE_EXISTS_CODE" ]; then
       ;;
     esac
   elif [ "$platform" = "al2023_gpu" ]; then
-    nvidia_driver_version=$(echo "$std_output" | grep "nvidia-driver-cuda" | awk '{print $2}' | cut -d'-' -f1 | sed 's/^[0-9]://')
-    # The AMI build pins to min(repo, S3 GRID .run) so all three driver
-    # variants can be built at the same version. Mirror that logic here.
-    grid_driver_version=$(aws s3 ls --recursive s3://ec2-linux-nvidia-drivers/ --no-sign-request |
-      grep -Eo "(NVIDIA-Linux-x86_64-)[0-9]+\.[0-9]+\.[0-9]+(-grid-aws\.run)" |
-      cut -d'-' -f4 |
-      sort -V |
-      tail -1)
-    if [ -z "$grid_driver_version" ]; then
-      echo "ERROR: Could not determine NVIDIA GRID driver version from S3"
-      exit 1
-    fi
-    nvidia_driver_version=$(printf '%s\n%s\n' "$nvidia_driver_version" "$grid_driver_version" | sort -V | head -1)
-    echo "true $nvidia_driver_version"
+    # This path should not be reached; al2023_gpu is handled above via repoquery
+    echo "ERROR: Unexpected al2023_gpu in check-upgrade result path"
+    exit 1
   else
     echo "true"
   fi
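
Stripped of the SSM plumbing, the new pinned-major decision reduces to one filter and two sort -V comparisons. A standalone sketch with illustrative values (only 580.126.09 appears in this repo; the other version strings are made up):

    pinned_major="580"
    installed="580.126.09"
    # Keep only versions within the pinned major, then take the newest.
    latest=$(printf '%s\n' 570.86.10 580.126.09 580.140.02 590.10.01 |
      grep "^${pinned_major}\." | sort -V | tail -1)                  # 580.140.02
    newer=$(printf '%s\n%s' "$installed" "$latest" | sort -V | tail -1)
    [ "$newer" = "$installed" ] && echo "false" || echo "true $latest"   # true 580.140.02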

variables.pkr.hcl

Lines changed: 6 additions & 0 deletions
@@ -258,3 +258,9 @@ variable "custom_endpoint_ec2" {
   description = "Custom EC2 endpoint to use for building AMIs"
   default     = ""
 }
+
+variable "nvidia_driver_major_al2023" {
+  type        = string
+  description = "Pinned NVIDIA driver major version for AL2023 GPU AMIs. Only driver versions within this major will be installed."
+  default     = "580"
+}
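
Bumping to a new driver major is then either a one-line change to the default above or a per-build override using standard Packer variable syntax (the value 581 is illustrative):

    packer build -var 'nvidia_driver_major_al2023=581' -only=amazon-ebs.al2023gpu .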
