Skip to content

Commit 9542ce2

Browse files
authored
Merge pull request #99 from almaslennikov/module-check
Add MOFED driver pre-flight checks: module deps and userspace users
2 parents 4e25e36 + cd315bc commit 9542ce2

6 files changed

Lines changed: 1020 additions & 9 deletions

File tree

cmd/network-operator-init-container/app/app.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package app
1717
import (
1818
"context"
1919
"fmt"
20+
"strings"
2021
"sync"
2122
"time"
2223

@@ -40,6 +41,7 @@ import (
4041

4142
"github.com/Mellanox/network-operator-init-container/cmd/network-operator-init-container/app/options"
4243
configPgk "github.com/Mellanox/network-operator-init-container/pkg/config"
44+
"github.com/Mellanox/network-operator-init-container/pkg/modules"
4345
"github.com/Mellanox/network-operator-init-container/pkg/utils/version"
4446
)
4547

@@ -137,6 +139,13 @@ func RunNetworkOperatorInitContainer(ctx context.Context, config *rest.Config, o
137139
}
138140
logger.Info("network-operator-init-container configuration", "config", initContCfg.String())
139141

142+
// Module dependency check — fail fast before safe driver loading
143+
if initContCfg.ModuleDependencyCheck.Enable {
144+
if err := runModuleDependencyCheck(ctx, initContCfg, logger); err != nil {
145+
return err
146+
}
147+
}
148+
140149
if !initContCfg.SafeDriverLoad.Enable {
141150
logger.Info("safe driver loading is disabled, exit")
142151
return nil
@@ -234,6 +243,97 @@ func writeCh(ch chan error, err error) {
234243
}
235244
}
236245

246+
// runModuleDependencyCheck performs the module dependency pre-flight check and reports any issues.
247+
func runModuleDependencyCheck(ctx context.Context, initContCfg *configPgk.Config, logger logr.Logger) error {
248+
logger.Info("running module dependency check",
249+
"modules", initContCfg.ModuleDependencyCheck.Modules)
250+
251+
procPath := initContCfg.ModuleDependencyCheck.HostProcPath
252+
if procPath == "" {
253+
procPath = "/proc"
254+
}
255+
sysPath := initContCfg.ModuleDependencyCheck.HostSysPath
256+
if sysPath == "" {
257+
sysPath = "/sys"
258+
}
259+
260+
if initContCfg.ModuleDependencyCheck.UnloadThirdPartyRDMA {
261+
logger.Info("UNLOAD_THIRD_PARTY_RDMA_MODULES is enabled; known third-party RDMA modules will be skipped")
262+
}
263+
264+
checker := modules.NewChecker(
265+
initContCfg.ModuleDependencyCheck.Modules,
266+
initContCfg.ModuleDependencyCheck.UnloadThirdPartyRDMA,
267+
procPath, sysPath, logger)
268+
269+
report, err := checker.RunAllChecks(ctx)
270+
if err != nil {
271+
return fmt.Errorf("module dependency check failed: %w", err)
272+
}
273+
274+
if err := reportPreFlightIssues(logger, report); err != nil {
275+
return err
276+
}
277+
logger.Info("module dependency check passed")
278+
return nil
279+
}
280+
281+
// reportPreFlightIssues logs all pre-flight check issues and returns an error if any were found.
282+
func reportPreFlightIssues(logger logr.Logger, report *modules.DependencyReport) error {
283+
totalIssues := len(report.ThirdPartyRDMA) + len(report.UnknownKernelModules) + len(report.UserspaceIssues)
284+
if totalIssues == 0 {
285+
return nil
286+
}
287+
288+
// Category 1: known third-party RDMA modules (automatable)
289+
if len(report.ThirdPartyRDMA) > 0 {
290+
for _, dep := range report.ThirdPartyRDMA {
291+
logger.Error(fmt.Errorf("third-party RDMA module dependency"),
292+
"third-party RDMA module blocking MOFED driver reload",
293+
"mofedModule", dep.MofedModule,
294+
"dependents", strings.Join(dep.Dependents, ", "))
295+
}
296+
logger.Error(fmt.Errorf("third-party RDMA modules require configuration change"),
297+
"Recommended action: set UNLOAD_THIRD_PARTY_RDMA_MODULES=\"true\" in "+
298+
"NicClusterPolicy ofedDriver env vars to automatically unload known third-party "+
299+
"RDMA modules before driver reload. Verify that no running workloads depend on "+
300+
"these modules before enabling.")
301+
}
302+
303+
// Category 2: unknown kernel modules (error level — manual intervention)
304+
if len(report.UnknownKernelModules) > 0 {
305+
for _, dep := range report.UnknownKernelModules {
306+
logger.Error(fmt.Errorf("unknown kernel module dependency"),
307+
"unrecognized module(s) blocking MOFED driver reload",
308+
"mofedModule", dep.MofedModule,
309+
"dependents", strings.Join(dep.Dependents, ", "))
310+
}
311+
logger.Error(fmt.Errorf("unknown kernel modules require manual intervention"),
312+
"Required action: manually unload or blacklist these modules "+
313+
"before deploying the DOCA driver. Automatic unloading is not supported "+
314+
"for unrecognized modules.")
315+
}
316+
317+
// Category 3: userspace processes (error level — manual intervention)
318+
if len(report.UserspaceIssues) > 0 {
319+
for _, issue := range report.UserspaceIssues {
320+
logger.Error(fmt.Errorf("userspace process holding module"),
321+
"userspace reference(s) blocking MOFED module unload",
322+
"module", issue.Module,
323+
"refcount", issue.Refcount,
324+
"kernelHolders", issue.HolderCount,
325+
"holders", strings.Join(issue.Holders, ", "),
326+
"userspaceRefs", issue.UserspaceCount)
327+
}
328+
logger.Error(fmt.Errorf("userspace processes require manual intervention"),
329+
"Required action: identify and stop processes using MOFED modules. "+
330+
"Run on host: lsof /dev/infiniband/* or fuser -v /dev/infiniband/*. "+
331+
"Common culprits: opensm, ibacm, rdma-ndd, srpd")
332+
}
333+
334+
return fmt.Errorf("pre-flight check found %d issue(s); cannot safely reload MOFED drivers", totalIssues)
335+
}
336+
237337
// SetupWithManager sets up the controller with the Manager.
238338
func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
239339
return ctrl.NewControllerManagedBy(mgr).

cmd/network-operator-init-container/app/app_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ func createNode(name string) *corev1.Node {
4747
return node
4848
}
4949

50-
func createConfig(cfg configPgk.Config) {
50+
func createConfig(cfg *configPgk.Config) {
5151
data, err := json.Marshal(cfg)
5252
ExpectWithOffset(1, err).NotTo(HaveOccurred())
5353
err = k8sClient.Create(ctx, &corev1.ConfigMap{
@@ -91,7 +91,7 @@ var _ = Describe("Init container", func() {
9191
defer GinkgoRecover()
9292
opts := newOpts()
9393
opts.NodeName = testNodeName
94-
createConfig(configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
94+
createConfig(&configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
9595
Enable: true,
9696
Annotation: testAnnotation,
9797
}})
@@ -123,7 +123,7 @@ var _ = Describe("Init container", func() {
123123
defer GinkgoRecover()
124124
opts := newOpts()
125125
opts.NodeName = "unknown-node"
126-
createConfig(configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
126+
createConfig(&configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
127127
Enable: true,
128128
Annotation: testAnnotation,
129129
}})
@@ -145,7 +145,7 @@ var _ = Describe("Init container", func() {
145145
defer GinkgoRecover()
146146
opts := newOpts()
147147
opts.NodeName = testNodeName
148-
createConfig(configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
148+
createConfig(&configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
149149
Enable: true,
150150
Annotation: testAnnotation,
151151
}})
@@ -186,7 +186,7 @@ var _ = Describe("Init container", func() {
186186
defer GinkgoRecover()
187187
opts := newOpts()
188188
opts.NodeName = testNodeName
189-
createConfig(configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
189+
createConfig(&configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
190190
Enable: false,
191191
}})
192192
var err error

pkg/config/config.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,22 @@ func Load(config string) (*Config, error) {
3636
type Config struct {
3737
// SafeDriverLoad holds the configuration options for the safeDriverLoading feature.
3838
SafeDriverLoad SafeDriverLoadConfig `json:"safeDriverLoad"`
39+
// ModuleDependencyCheck holds the configuration options for the module dependency checking feature.
40+
ModuleDependencyCheck ModuleDependencyCheckConfig `json:"moduleDependencyCheck"`
41+
}
42+
43+
// ModuleDependencyCheckConfig contains the configuration options for the module dependency checking feature.
44+
type ModuleDependencyCheckConfig struct {
45+
// Enable turns the module dependency checking feature on.
46+
Enable bool `json:"enable"`
47+
// Modules is the list of MOFED kernel modules to check for external dependencies.
48+
Modules []string `json:"modules"`
49+
// UnloadThirdPartyRDMA, when true, treats all known third-party RDMA modules as allowed (the driver will handle them).
50+
UnloadThirdPartyRDMA bool `json:"unloadThirdPartyRdma"`
51+
// HostProcPath is the path to the host's /proc filesystem mount inside the container.
52+
HostProcPath string `json:"hostProcPath"`
53+
// HostSysPath is the path to the host's /sys filesystem mount inside the container.
54+
HostSysPath string `json:"hostSysPath"`
3955
}
4056

4157
// SafeDriverLoadConfig contains configuration options for safeDriverLoading feature
@@ -51,6 +67,9 @@ func (c *Config) Validate() error {
5167
if c.SafeDriverLoad.Enable && c.SafeDriverLoad.Annotation == "" {
5268
return fmt.Errorf(".safeDriverLoad.annotation is required if safeDriverLoad feature is enabled")
5369
}
70+
if c.ModuleDependencyCheck.Enable && len(c.ModuleDependencyCheck.Modules) == 0 {
71+
return fmt.Errorf(".moduleDependencyCheck.modules is required if moduleDependencyCheck feature is enabled")
72+
}
5473
return nil
5574
}
5675

pkg/config/config_test.go

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,22 @@ import (
2222
configPgk "github.com/Mellanox/network-operator-init-container/pkg/config"
2323
)
2424

25-
func createConfig(cfg configPgk.Config) string {
25+
func createConfig(cfg *configPgk.Config) string {
2626
data, err := json.Marshal(cfg)
2727
ExpectWithOffset(1, err).NotTo(HaveOccurred())
2828
return string(data)
2929
}
3030

3131
var _ = Describe("Config test", func() {
3232
It("Valid - safeDriverLoad disabled", func() {
33-
cfg, err := configPgk.Load(createConfig(configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
33+
cfg, err := configPgk.Load(createConfig(&configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
3434
Enable: false,
3535
}}))
3636
Expect(err).NotTo(HaveOccurred())
3737
Expect(cfg.SafeDriverLoad.Enable).To(BeFalse())
3838
})
3939
It("Valid - safeDriverLoad enabled", func() {
40-
cfg, err := configPgk.Load(createConfig(configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
40+
cfg, err := configPgk.Load(createConfig(&configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
4141
Enable: true,
4242
Annotation: "something",
4343
}}))
@@ -50,9 +50,64 @@ var _ = Describe("Config test", func() {
5050
Expect(err).To(HaveOccurred())
5151
})
5252
It("Logical validation failed - no annotation", func() {
53-
_, err := configPgk.Load(createConfig(configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
53+
_, err := configPgk.Load(createConfig(&configPgk.Config{SafeDriverLoad: configPgk.SafeDriverLoadConfig{
5454
Enable: true,
5555
}}))
5656
Expect(err).To(HaveOccurred())
5757
})
58+
It("Valid - moduleDependencyCheck disabled", func() {
59+
cfg, err := configPgk.Load(createConfig(&configPgk.Config{
60+
ModuleDependencyCheck: configPgk.ModuleDependencyCheckConfig{
61+
Enable: false,
62+
},
63+
}))
64+
Expect(err).NotTo(HaveOccurred())
65+
Expect(cfg.ModuleDependencyCheck.Enable).To(BeFalse())
66+
})
67+
It("Valid - moduleDependencyCheck enabled", func() {
68+
cfg, err := configPgk.Load(createConfig(&configPgk.Config{
69+
ModuleDependencyCheck: configPgk.ModuleDependencyCheckConfig{
70+
Enable: true,
71+
Modules: []string{"mlx5_core", "mlx5_ib"},
72+
HostProcPath: "/host/proc",
73+
HostSysPath: "/host/sys",
74+
},
75+
}))
76+
Expect(err).NotTo(HaveOccurred())
77+
Expect(cfg.ModuleDependencyCheck.Enable).To(BeTrue())
78+
Expect(cfg.ModuleDependencyCheck.Modules).To(Equal([]string{"mlx5_core", "mlx5_ib"}))
79+
Expect(cfg.ModuleDependencyCheck.HostProcPath).To(Equal("/host/proc"))
80+
Expect(cfg.ModuleDependencyCheck.HostSysPath).To(Equal("/host/sys"))
81+
})
82+
It("Valid - moduleDependencyCheck enabled with UnloadThirdPartyRDMA", func() {
83+
cfg, err := configPgk.Load(createConfig(&configPgk.Config{
84+
ModuleDependencyCheck: configPgk.ModuleDependencyCheckConfig{
85+
Enable: true,
86+
Modules: []string{"mlx5_core", "ib_core"},
87+
UnloadThirdPartyRDMA: true,
88+
HostProcPath: "/host/proc",
89+
HostSysPath: "/host/sys",
90+
},
91+
}))
92+
Expect(err).NotTo(HaveOccurred())
93+
Expect(cfg.ModuleDependencyCheck.Enable).To(BeTrue())
94+
Expect(cfg.ModuleDependencyCheck.Modules).To(Equal([]string{"mlx5_core", "ib_core"}))
95+
Expect(cfg.ModuleDependencyCheck.UnloadThirdPartyRDMA).To(BeTrue())
96+
})
97+
It("Logical validation failed - moduleDependencyCheck enabled with no modules", func() {
98+
_, err := configPgk.Load(createConfig(&configPgk.Config{
99+
ModuleDependencyCheck: configPgk.ModuleDependencyCheckConfig{
100+
Enable: true,
101+
},
102+
}))
103+
Expect(err).To(HaveOccurred())
104+
})
105+
It("Backward compatible - old config without moduleDependencyCheck field", func() {
106+
// Simulate an old ConfigMap that only has safeDriverLoad
107+
oldJSON := `{"safeDriverLoad":{"enable":false,"annotation":""}}`
108+
cfg, err := configPgk.Load(oldJSON)
109+
Expect(err).NotTo(HaveOccurred())
110+
Expect(cfg.ModuleDependencyCheck.Enable).To(BeFalse())
111+
Expect(cfg.ModuleDependencyCheck.Modules).To(BeNil())
112+
})
58113
})

0 commit comments

Comments
 (0)