Skip to content

Commit 2d5ad13

Browse files
Harish Senthilkumar (harishxr)
authored and committed
Add kmod-util script for AL2023 GPU driver kernel-module management
1 parent 435ebe7 commit 2d5ad13

1 file changed

Lines changed: 217 additions & 0 deletions

File tree

scripts/al2023/gpu/kmod-util

Lines changed: 217 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,217 @@
1+
#!/usr/bin/env bash

set -Eeuo pipefail

# With nullglob set, a glob that matches nothing expands to no words at all
# instead of being left as the literal pattern.
shopt -s nullglob

# dkms is not guaranteed to be on the PATH, so probe the known install
# locations in order and keep the first one that is executable.
DKMS=""
for candidate in /usr/bin/dkms /usr/sbin/dkms; do
  [ -x "$candidate" ] || continue
  DKMS=$candidate
  break
done

# Abort when no dkms executable was found. log() is only defined further
# down, so the message is formatted inline here.
[ -n "$DKMS" ] || {
  echo >&2 "$(date '+%Y-%m-%dT%H:%M:%S%z')" "[kmod-util]" "ERROR: dkms not found"
  exit 1
}

# Directory holding the DKMS module tarballs managed by this script, one
# sub-directory per module name.
DKMS_ARCHIVE_DIR=/var/lib/dkms-archive

# Lock file used to serialise operations between invocations.
LOCK_FILE=/var/lock/kmod-util.lock
27+
28+
# Write one timestamped, "[kmod-util]"-tagged line to stderr.
# Arguments: the message words; they are printed separated by single spaces.
function log() {
  local -a words=("$(date '+%Y-%m-%dT%H:%M:%S%z')" "[kmod-util]" "$@")
  echo "${words[@]}" >&2
}
31+
32+
# Take the exclusive lock that keeps concurrent operations apart.
# Globals:  LOCK_FILE (read) - path of the lock file; our PID is written to it
# Returns:  0 once the lock file was created, 1 after 60 failed attempts
#           (one attempt per second)
function acquire_lock() {
  local -r max_attempts=60
  local attempt=0

  while ((attempt < max_attempts)); do
    # noclobber makes the redirection fail when the file already exists, so
    # creating the file is the "take the lock" step.
    (
      set -o noclobber
      echo $$ > "$LOCK_FILE"
    ) 2> /dev/null && return 0
    sleep 1
    attempt=$((attempt + 1))
  done

  log "ERROR: Failed to acquire lock after $max_attempts seconds"
  return 1
}
48+
49+
# Release the lock, but only if this process is the one holding it.
#
# acquire_lock writes the owning shell's PID into LOCK_FILE. This function is
# run from an EXIT trap, i.e. also for commands that never took the lock and
# after a timed-out acquire_lock; removing the file unconditionally in those
# cases would delete a lock that belongs to another running invocation.
#
# Globals:  LOCK_FILE (read)
# Returns:  0 always (a missing or foreign lock file is not an error)
function release_lock() {
  local owner=""

  [[ -f "$LOCK_FILE" ]] || return 0

  # The file may vanish or be unreadable between the check and the read;
  # treat that the same as "not ours".
  { read -r owner < "$LOCK_FILE"; } 2> /dev/null || true

  if [[ "$owner" == "$$" ]]; then
    rm -f -- "$LOCK_FILE"
  fi
  return 0
}
52+
53+
# Report whether lspci lists at least one device for NVIDIA's PCI vendor ID.
# Returns: 0 when one or more lines are listed, 1 when none are.
function has-nvidia-device() {
  local -r vendor_id="10de" # NVIDIA's PCI vendor ID
  local listed
  listed=$(lspci -d "${vendor_id}::" | wc -l)
  [ "${listed}" -gt 0 ] && return 0
  return 1
}
63+
64+
# Print the version of a DKMS-registered module, parsed from the first line
# of `dkms status -m MODULE`.
#
# Accepted status line shapes:
#   module/version, kernel, arch: status
#   module/version: status
#   module, version, kernel, arch: status   (comma-separated form printed by
#                                            older dkms releases)
#
# Globals:   DKMS (read) - dkms executable
# Arguments: $1 - module name
# Outputs:   the version on stdout; errors via log
# Returns:   0 on success, 1 if dkms fails, reports nothing, or the line
#            cannot be parsed
function module-version() {
  local MODULE_NAME="${1}"
  local status_output status_line first_field version

  # Capture the full output and take the first line in-shell. Piping into
  # `head -n 1` can terminate dkms with SIGPIPE, which `pipefail` would turn
  # into a spurious failure of the whole pipeline.
  if ! status_output=$("${DKMS}" status -m "${MODULE_NAME}" 2> /dev/null); then
    log "ERROR: No DKMS status found for module: ${MODULE_NAME}"
    return 1
  fi
  status_line="${status_output%%$'\n'*}"

  if [ -z "$status_line" ]; then
    log "ERROR: No DKMS status found for module: ${MODULE_NAME}"
    return 1
  fi

  first_field="${status_line%%,*}"
  if [[ "$first_field" == */* ]]; then
    # "module/version[: status]" -> keep what sits between the first '/' and
    # the next '/' or ':'.
    version="${first_field#*/}"
    version="${version%%/*}"
    version="${version%%:*}"
  elif [[ "$status_line" == *,* ]]; then
    # "module, version, ..." -> second comma-separated field.
    version="${status_line#*,}"
    version="${version%%[,:]*}"
  else
    version=""
  fi

  # Trim surrounding whitespace.
  IFS=$' \t\n' read -r version <<< "$version" || true

  if [ -z "$version" ]; then
    log "ERROR: Could not parse version from DKMS status: ${status_line}"
    return 1
  fi

  printf '%s\n' "$version"
}
78+
79+
# Unpack a module's archived DKMS tarball and install the module.
#
# Globals:   DKMS (read), DKMS_ARCHIVE_DIR (read)
# Arguments: $1 - module name; archives are looked up in
#                 ${DKMS_ARCHIVE_DIR}/<module name>/*.tar.gz
# Returns:   0 on success; 1 if the lock cannot be taken, no archive exists,
#            or a dkms step fails
function load() {
  local MODULE_NAME="${1}"
  acquire_lock || return 1

  log "unpacking: ${MODULE_NAME}"
  # Glob straight into an array so directory names containing whitespace are
  # not word-split. The -e test covers shells without nullglob, where an
  # unmatched pattern stays in the array as a literal string.
  local archives=("${DKMS_ARCHIVE_DIR}/${MODULE_NAME}"/*.tar.gz)
  if [[ ${#archives[@]} -eq 0 || ! -e "${archives[0]}" ]]; then
    log "ERROR: No archive found for ${MODULE_NAME}"
    return 1
  fi

  # Only the first matching archive is loaded.
  if ! "${DKMS}" ldtarball "${archives[0]}"; then
    log "ERROR: Failed to unpack ${archives[0]}"
    return 1
  fi
  log "unpacked: ${MODULE_NAME}"

  log "installing: ${MODULE_NAME}"
  local MODULE_VERSION
  MODULE_VERSION=$(module-version "${MODULE_NAME}") || return 1
  if ! "${DKMS}" install -m "${MODULE_NAME}" -v "${MODULE_VERSION}"; then
    log "ERROR: Failed to install ${MODULE_NAME} ${MODULE_VERSION}"
    return 1
  fi
  log "installed: ${MODULE_NAME}"
}
101+
102+
# Remove a DKMS module via `dkms remove ... --all`, using the version
# reported by module-version.
# Arguments: $1 - module name
# Returns:   1 when the lock cannot be acquired
function remove() {
  local module="${1}"
  acquire_lock || return 1

  log "removing: ${module}"
  local version
  version=$(module-version "${module}")
  local -a dkms_args=(remove -m "${module}" -v "${version}" --all)
  "${DKMS}" "${dkms_args[@]}"
  log "removed: ${module}"
}
113+
114+
# Create a DKMS tarball for a module and copy it into the archive directory.
#
# Globals:   DKMS (read), DKMS_ARCHIVE_DIR (read)
# Arguments: $1 - module name
# Returns:   0 on success; 1 if the lock cannot be taken, the version cannot
#            be determined, mktarball fails, or no tarball is found afterwards
function archive() {
  local MODULE_NAME="${1}"
  acquire_lock || return 1

  log "archiving: ${MODULE_NAME}"
  local dest_dir="${DKMS_ARCHIVE_DIR}/${MODULE_NAME}"
  mkdir -p -- "${dest_dir}"

  local MODULE_VERSION
  MODULE_VERSION=$(module-version "${MODULE_NAME}") || return 1
  if ! "${DKMS}" mktarball -m "${MODULE_NAME}" -v "${MODULE_VERSION}"; then
    log "ERROR: Failed to create tarball for ${MODULE_NAME} ${MODULE_VERSION}"
    return 1
  fi

  # Collect the tarballs first. With nullglob an unmatched pattern expands to
  # nothing, which would leave cp with only its destination operand; without
  # nullglob it stays a literal string, which the -e test catches.
  local tarballs=("/var/lib/dkms/${MODULE_NAME}/${MODULE_VERSION}/tarball"/*.tar.gz)
  if [[ ${#tarballs[@]} -eq 0 || ! -e "${tarballs[0]}" ]]; then
    log "ERROR: No tarball found for ${MODULE_NAME} ${MODULE_VERSION}"
    return 1
  fi

  if ! cp -- "${tarballs[@]}" "${dest_dir}/"; then
    log "ERROR: Failed to copy tarball for ${MODULE_NAME} to ${dest_dir}"
    return 1
  fi
  log "archived: ${MODULE_NAME}"
}
127+
128+
# Run `dkms build` for a module at the version reported by module-version.
# Arguments: $1 - module name
# Returns:   1 when the lock cannot be acquired
function build() {
  local module="${1}"
  acquire_lock || return 1

  local version
  version=$(module-version "${module}")
  local -a dkms_args=(build -m "${module}" -v "${version}")
  "${DKMS}" "${dkms_args[@]}"
}
137+
138+
# Print the help text to stderr.
# Globals: DKMS_ARCHIVE_DIR (read) - shown in the NOTES section
# The heredoc delimiter is unquoted on purpose so that $0 and
# $DKMS_ARCHIVE_DIR are expanded in the output.
function usage() {
  cat >&2 << EOF
usage: $0 COMMAND [MODULE_NAME]

Kernel module utilities for dynamic GPU driver management.
Supports load/remove/archive/build/module-version operations for DKMS modules.

COMMANDS:
  load MODULE_NAME            Load a kernel module from archives
  remove MODULE_NAME          Remove an installed kernel module
  archive MODULE_NAME         Archive an installed kernel module
  build MODULE_NAME           Build a kernel module
  module-version MODULE_NAME  Get the version of a registered kernel module
  has-nvidia-device           Exit 0 if an NVIDIA PCI device is present, 1 otherwise

EXAMPLES:
  $0 load nvidia              # Load nvidia module
  $0 remove nvidia            # Remove nvidia module
  $0 archive nvidia           # Archive nvidia module for later use
  $0 build nvidia             # Build nvidia module
  $0 module-version nvidia    # Get nvidia module version
  $0 has-nvidia-device        # Check for an NVIDIA device

NOTES:
  - Operations are protected by file locking to prevent conflicts
  - Archives are stored in $DKMS_ARCHIVE_DIR
EOF
}
164+
165+
# Split the command line into the globals COMMAND and MODULE_NAME.
# Arguments: $1 - command (required); $2 - module name (optional)
# Globals:   COMMAND (written), MODULE_NAME (written; empty when $2 is absent)
# Exits with status 1 after printing the usage text when called without
# arguments.
function parse_args() {
  (($# > 0)) || {
    usage
    exit 1
  }

  MODULE_NAME="${2:-}"
  COMMAND="$1"
}
175+
176+
# Validate the COMMAND / MODULE_NAME pair set by parse_args.
# Globals: COMMAND (read), MODULE_NAME (read)
# On any violation the error is logged, the usage text is printed and the
# script exits with status 1.
function verify_args() {
  local problem=""

  case "$COMMAND" in
    load | remove | archive | build | module-version)
      # These commands operate on a module and need its name.
      [[ -n "$MODULE_NAME" ]] || problem="ERROR: Command '$COMMAND' requires a module name"
      ;;
    has-nvidia-device)
      [[ -z "$MODULE_NAME" ]] || problem="ERROR: Command '$COMMAND' takes no arguments"
      ;;
    *)
      problem="ERROR: Unknown command: $COMMAND"
      ;;
  esac

  [[ -z "$problem" ]] && return 0

  log "$problem"
  usage
  exit 1
}
199+
200+
# Entry point: install the exit handler, parse and validate the command
# line, then dispatch to the function named after the command.
# Arguments: the script's command-line arguments
function main() {
  # Run release_lock whenever the shell exits.
  trap 'release_lock' EXIT

  parse_args "$@"
  verify_args

  if [[ "$COMMAND" == "has-nvidia-device" ]]; then
    # The only command that takes no module name.
    "${COMMAND}"
  elif [[ "$COMMAND" =~ ^(load|remove|archive|build|module-version)$ ]]; then
    "${COMMAND}" "$MODULE_NAME"
  fi
}
216+
217+
# Script entry point: pass every command-line argument through to main.
main "$@"

0 commit comments

Comments
 (0)