@@ -17,6 +17,7 @@ package app
1717import (
1818 "context"
1919 "fmt"
20+ "strings"
2021 "sync"
2122 "time"
2223
@@ -40,6 +41,7 @@ import (
4041
4142 "github.com/Mellanox/network-operator-init-container/cmd/network-operator-init-container/app/options"
4243 configPgk "github.com/Mellanox/network-operator-init-container/pkg/config"
44+ "github.com/Mellanox/network-operator-init-container/pkg/modules"
4345 "github.com/Mellanox/network-operator-init-container/pkg/utils/version"
4446)
4547
@@ -137,6 +139,13 @@ func RunNetworkOperatorInitContainer(ctx context.Context, config *rest.Config, o
137139 }
138140 logger .Info ("network-operator-init-container configuration" , "config" , initContCfg .String ())
139141
142+ // Module dependency check — fail fast before safe driver loading
143+ if initContCfg .ModuleDependencyCheck .Enable {
144+ if err := runModuleDependencyCheck (ctx , initContCfg , logger ); err != nil {
145+ return err
146+ }
147+ }
148+
140149 if ! initContCfg .SafeDriverLoad .Enable {
141150 logger .Info ("safe driver loading is disabled, exit" )
142151 return nil
@@ -234,6 +243,97 @@ func writeCh(ch chan error, err error) {
234243 }
235244}
236245
246+ // runModuleDependencyCheck performs the module dependency pre-flight check and reports any issues.
247+ func runModuleDependencyCheck (ctx context.Context , initContCfg * configPgk.Config , logger logr.Logger ) error {
248+ logger .Info ("running module dependency check" ,
249+ "modules" , initContCfg .ModuleDependencyCheck .Modules )
250+
251+ procPath := initContCfg .ModuleDependencyCheck .HostProcPath
252+ if procPath == "" {
253+ procPath = "/proc"
254+ }
255+ sysPath := initContCfg .ModuleDependencyCheck .HostSysPath
256+ if sysPath == "" {
257+ sysPath = "/sys"
258+ }
259+
260+ if initContCfg .ModuleDependencyCheck .UnloadThirdPartyRDMA {
261+ logger .Info ("UNLOAD_THIRD_PARTY_RDMA_MODULES is enabled; known third-party RDMA modules will be skipped" )
262+ }
263+
264+ checker := modules .NewChecker (
265+ initContCfg .ModuleDependencyCheck .Modules ,
266+ initContCfg .ModuleDependencyCheck .UnloadThirdPartyRDMA ,
267+ procPath , sysPath , logger )
268+
269+ report , err := checker .RunAllChecks (ctx )
270+ if err != nil {
271+ return fmt .Errorf ("module dependency check failed: %w" , err )
272+ }
273+
274+ if err := reportPreFlightIssues (logger , report ); err != nil {
275+ return err
276+ }
277+ logger .Info ("module dependency check passed" )
278+ return nil
279+ }
280+
281+ // reportPreFlightIssues logs all pre-flight check issues and returns an error if any were found.
282+ func reportPreFlightIssues (logger logr.Logger , report * modules.DependencyReport ) error {
283+ totalIssues := len (report .ThirdPartyRDMA ) + len (report .UnknownKernelModules ) + len (report .UserspaceIssues )
284+ if totalIssues == 0 {
285+ return nil
286+ }
287+
288+ // Category 1: known third-party RDMA modules (automatable)
289+ if len (report .ThirdPartyRDMA ) > 0 {
290+ for _ , dep := range report .ThirdPartyRDMA {
291+ logger .Error (fmt .Errorf ("third-party RDMA module dependency" ),
292+ "third-party RDMA module blocking MOFED driver reload" ,
293+ "mofedModule" , dep .MofedModule ,
294+ "dependents" , strings .Join (dep .Dependents , ", " ))
295+ }
296+ logger .Error (fmt .Errorf ("third-party RDMA modules require configuration change" ),
297+ "Recommended action: set UNLOAD_THIRD_PARTY_RDMA_MODULES=\" true\" in " +
298+ "NicClusterPolicy ofedDriver env vars to automatically unload known third-party " +
299+ "RDMA modules before driver reload. Verify that no running workloads depend on " +
300+ "these modules before enabling." )
301+ }
302+
303+ // Category 2: unknown kernel modules (error level — manual intervention)
304+ if len (report .UnknownKernelModules ) > 0 {
305+ for _ , dep := range report .UnknownKernelModules {
306+ logger .Error (fmt .Errorf ("unknown kernel module dependency" ),
307+ "unrecognized module(s) blocking MOFED driver reload" ,
308+ "mofedModule" , dep .MofedModule ,
309+ "dependents" , strings .Join (dep .Dependents , ", " ))
310+ }
311+ logger .Error (fmt .Errorf ("unknown kernel modules require manual intervention" ),
312+ "Required action: manually unload or blacklist these modules " +
313+ "before deploying the DOCA driver. Automatic unloading is not supported " +
314+ "for unrecognized modules." )
315+ }
316+
317+ // Category 3: userspace processes (error level — manual intervention)
318+ if len (report .UserspaceIssues ) > 0 {
319+ for _ , issue := range report .UserspaceIssues {
320+ logger .Error (fmt .Errorf ("userspace process holding module" ),
321+ "userspace reference(s) blocking MOFED module unload" ,
322+ "module" , issue .Module ,
323+ "refcount" , issue .Refcount ,
324+ "kernelHolders" , issue .HolderCount ,
325+ "holders" , strings .Join (issue .Holders , ", " ),
326+ "userspaceRefs" , issue .UserspaceCount )
327+ }
328+ logger .Error (fmt .Errorf ("userspace processes require manual intervention" ),
329+ "Required action: identify and stop processes using MOFED modules. " +
330+ "Run on host: lsof /dev/infiniband/* or fuser -v /dev/infiniband/*. " +
331+ "Common culprits: opensm, ibacm, rdma-ndd, srpd" )
332+ }
333+
334+ return fmt .Errorf ("pre-flight check found %d issue(s); cannot safely reload MOFED drivers" , totalIssues )
335+ }
336+
237337// SetupWithManager sets up the controller with the Manager.
238338func (r * NodeReconciler ) SetupWithManager (mgr ctrl.Manager ) error {
239339 return ctrl .NewControllerManagedBy (mgr ).
0 commit comments