adding VMI network outages

paigerube14 · paigerube14 · commit 66456563d7a5 · 2026-04-17T13:40:27.000-04:00
Signed-off-by: Paige Patton &lt;prubenda@redhat.com&gt;
diff --git a/krkn/scenario_plugins/network_chaos_ng/models.py b/krkn/scenario_plugins/network_chaos_ng/models.py
@@ -20,6 +20,7 @@
 class NetworkChaosScenarioType(Enum):
     Node = 1
     Pod = 2
+    VMI = 3
 
 
 @dataclass
diff --git a/krkn/scenario_plugins/network_chaos_ng/modules/abstract_network_chaos_module.py b/krkn/scenario_plugins/network_chaos_ng/modules/abstract_network_chaos_module.py
@@ -91,6 +91,35 @@ def get_pod_targets(self, config: BaseNetworkChaosConfig):
                 )
             return [config.target]
 
+    def get_vmi_targets(self, config: BaseNetworkChaosConfig) -> list[str]:
+        """
+        Returns the list of VMI targets in "namespace/vmi-name" format.
+        Supports regex matching on both name (via `target`) and namespace,
+        and optional post-filtering by `label_selector` in "key=value" format.
+        """
+        if not config.namespace:
+            raise Exception("namespace not specified for VMI scenario, aborting")
+        name_regex = config.target if config.target else ".*"
+        vmis = self.kubecli.get_lib_kubernetes().get_vmis(name_regex, config.namespace)
+        if not vmis:
+            return []
+        if config.label_selector:
+            try:
+                label_key, label_value = config.label_selector.split("=", 1)
+            except ValueError:
+                raise Exception(
+                    f"invalid label_selector format: '{config.label_selector}', expected 'key=value'"
+                )
+            vmis = [
+                vmi
+                for vmi in vmis
+                if vmi.get("metadata", {}).get("labels", {}).get(label_key) == label_value
+            ]
+        return [
+            f"{vmi['metadata']['namespace']}/{vmi['metadata']['name']}"
+            for vmi in vmis
+        ]
+
     def __init__(
         self,
         base_network_config: BaseNetworkChaosConfig,
diff --git a/krkn/scenario_plugins/network_chaos_ng/modules/node_interface_down.py b/krkn/scenario_plugins/network_chaos_ng/modules/node_interface_down.py
@@ -1,3 +1,16 @@
+# Copyright 2025 The Krkn Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import queue
 import time
 from typing import Tuple
@@ -84,13 +97,16 @@ def run(self, target: str, error_queue: queue.Queue = None):
             # to the control plane, so exec_cmd_in_pod can no longer reach the pod.
             # The background process runs entirely on the node and fires regardless of
             # control-plane connectivity.
+            # The recovery process runs with all stdio redirected to /dev/null so it does
+            # not hold the exec session's file descriptors open, allowing exec to return
+            # immediately after down_cmds completes.
             recovery_cmds = " && ".join(
                 [f"ip link set {iface} up" for iface in interfaces]
             )
             down_cmds = " && ".join(
                 [f"ip link set {iface} down" for iface in interfaces]
             )
-            cmd = f"(sleep {self.config.test_duration} && {recovery_cmds}) & {down_cmds}"
+            cmd = f"sh -c 'sleep {self.config.test_duration} && {recovery_cmds}' </dev/null >/dev/null 2>&1 & {down_cmds}"
             self.kubecli.get_lib_kubernetes().exec_cmd_in_pod(
                 [cmd], pod_name, self.config.namespace
             )
@@ -101,13 +117,6 @@ def run(self, target: str, error_queue: queue.Queue = None):
                 target,
             )
 
-            log_info(
-                f"waiting {self.config.test_duration} seconds for interface(s) to recover",
-                parallel,
-                target,
-            )
-            time.sleep(self.config.test_duration)
-
             log_info(
                 f"waiting for node {target} to become Ready after interface recovery",
                 parallel,
diff --git a/krkn/scenario_plugins/network_chaos_ng/modules/vmi_network_chaos.py b/krkn/scenario_plugins/network_chaos_ng/modules/vmi_network_chaos.py
@@ -0,0 +1,229 @@
+# Copyright 2025 The Krkn Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import queue
+import time
+from typing import Tuple
+
+from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
+from krkn_lib.utils import get_random_string
+
+from krkn.scenario_plugins.network_chaos_ng.models import (
+    NetworkChaosScenarioType,
+    BaseNetworkChaosConfig,
+    NetworkChaosConfig,
+)
+from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
+    AbstractNetworkChaosModule,
+)
+from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
+    log_info,
+    log_error,
+    deploy_network_chaos_ng_pod,
+    get_pod_default_interface,
+)
+from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_chaos import (
+    common_set_limit_rules,
+    common_delete_limit_rules,
+)
+
+
+class VmiNetworkChaosModule(AbstractNetworkChaosModule):
+
+    def __init__(self, config: NetworkChaosConfig, kubecli: KrknTelemetryOpenshift):
+        super().__init__(config, kubecli)
+        self.config = config
+
+    def run(self, target: str, error_queue: queue.Queue = None):
+        # target is "namespace/vmi-name" as produced by get_vmi_targets()
+        parallel = False
+        if error_queue:
+            parallel = True
+        try:
+            namespace, vmi_name = target.split("/", 1)
+            network_chaos_pod_name = f"vmi-network-chaos-{get_random_string(5)}"
+            container_name = f"fedora-container-{get_random_string(5)}"
+
+            log_info(
+                f"creating workload to inject network chaos in VMI {vmi_name} "
+                f"latency:{str(self.config.latency) if self.config.latency else '0'}, "
+                f"packet drop:{str(self.config.loss) if self.config.loss else '0'} "
+                f"bandwidth restriction:{str(self.config.bandwidth) if self.config.bandwidth else '0'} ",
+                parallel,
+                network_chaos_pod_name,
+            )
+
+            # Resolve which node the VMI is running on
+            vmi = self.kubecli.get_lib_kubernetes().get_vmi(vmi_name, namespace)
+            if not vmi:
+                raise Exception(
+                    f"VMI {vmi_name} not found in namespace {namespace}"
+                )
+
+            node_name = vmi.get("status", {}).get("nodeName")
+            if not node_name:
+                raise Exception(
+                    f"unable to determine node for VMI {vmi_name} in namespace {namespace}; "
+                    "VMI may not be in Running phase"
+                )
+
+            log_info(
+                f"VMI {vmi_name} is running on node {node_name}",
+                parallel,
+                network_chaos_pod_name,
+            )
+
+            # The virt-launcher pod carries the VMI's network namespace.
+            # It is labelled kubevirt.io/domain=<vmi-name>.
+            virt_launcher_pods = self.kubecli.get_lib_kubernetes().list_pods(
+                namespace, label_selector=f"kubevirt.io/domain={vmi_name}"
+            )
+            if not virt_launcher_pods:
+                raise Exception(
+                    f"no virt-launcher pod found for VMI {vmi_name} in namespace {namespace}"
+                )
+            virt_launcher_pod_name = virt_launcher_pods[0]
+
+            log_info(
+                f"resolved virt-launcher pod {virt_launcher_pod_name} for VMI {vmi_name}",
+                parallel,
+                network_chaos_pod_name,
+            )
+
+            # Deploy the privileged chaos pod onto the VMI's node.
+            # host_network=False so that tc rules are applied via nsenter into
+            # the virt-launcher pod's network namespace rather than the node's.
+            deploy_network_chaos_ng_pod(
+                self.config,
+                node_name,
+                network_chaos_pod_name,
+                self.kubecli.get_lib_kubernetes(),
+                container_name,
+                host_network=False,
+            )
+
+            # Detect the default network interface.  When host_network=False the
+            # chaos pod has its own network namespace, so the name returned here
+            # (typically "eth0") is the chaos pod's interface, not the launcher's.
+            # It is reused as the target inside the virt-launcher's namespace.
+            # NOTE(review): assumes both share that name; else set `interfaces`.
+            if len(self.config.interfaces) == 0:
+                interfaces = [
+                    get_pod_default_interface(
+                        network_chaos_pod_name,
+                        self.config.namespace,
+                        self.kubecli.get_lib_kubernetes(),
+                    )
+                ]
+                if not interfaces[0]:
+                    log_error(
+                        "no network interface detected; impossible to execute the network chaos scenario",
+                        parallel,
+                        network_chaos_pod_name,
+                    )
+                    self.kubecli.get_lib_kubernetes().delete_pod(
+                        network_chaos_pod_name, self.config.namespace
+                    )
+                    return
+                log_info(
+                    f"detected default interface: {interfaces[0]}",
+                    parallel,
+                    network_chaos_pod_name,
+                )
+            else:
+                interfaces = self.config.interfaces
+
+            # Retrieve the container IDs of the virt-launcher pod so we can
+            # identify the cgroup and resolve the host-visible PIDs via /proc.
+            container_ids = self.kubecli.get_lib_kubernetes().get_container_ids(
+                virt_launcher_pod_name, namespace
+            )
+            if not container_ids:
+                raise Exception(
+                    f"impossible to resolve container ID for virt-launcher pod "
+                    f"{virt_launcher_pod_name} in namespace {namespace}"
+                )
+
+            log_info(
+                f"targeting virt-launcher container {container_ids[0]}",
+                parallel,
+                network_chaos_pod_name,
+            )
+
+            # Resolve the host PIDs for the virt-launcher container so that
+            # nsenter can enter its network namespace.
+            pids = self.kubecli.get_lib_kubernetes().get_pod_pids(
+                base_pod_name=network_chaos_pod_name,
+                base_pod_namespace=self.config.namespace,
+                base_pod_container_name=container_name,
+                pod_name=virt_launcher_pod_name,
+                pod_namespace=namespace,
+                pod_container_id=container_ids[0],
+            )
+            if not pids:
+                raise Exception(
+                    f"impossible to resolve PIDs for virt-launcher pod {virt_launcher_pod_name}"
+                )
+
+            log_info(
+                f"resolved PIDs {pids} on node {node_name} for VMI {vmi_name}",
+                parallel,
+                network_chaos_pod_name,
+            )
+
+            common_set_limit_rules(
+                self.config.egress,
+                self.config.ingress,
+                interfaces,
+                self.config.bandwidth,
+                self.config.latency,
+                self.config.loss,
+                parallel,
+                network_chaos_pod_name,
+                self.kubecli.get_lib_kubernetes(),
+                network_chaos_pod_name,
+                self.config.namespace,
+                pids,
+            )
+
+            time.sleep(self.config.test_duration)
+
+            log_info("removing tc rules", parallel, network_chaos_pod_name)
+
+            common_delete_limit_rules(
+                self.config.egress,
+                self.config.ingress,
+                interfaces,
+                network_chaos_pod_name,
+                self.config.namespace,
+                self.kubecli.get_lib_kubernetes(),
+                pids,
+                parallel,
+                network_chaos_pod_name,
+            )
+
+            self.kubecli.get_lib_kubernetes().delete_pod(
+                network_chaos_pod_name, self.config.namespace
+            )
+
+        except Exception as e:
+            if error_queue is None:
+                raise e
+            else:
+                error_queue.put(str(e))
+
+    def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
+        return NetworkChaosScenarioType.VMI, self.config
+
+    def get_targets(self) -> list[str]:
+        return self.get_vmi_targets(self.config)
diff --git a/krkn/scenario_plugins/network_chaos_ng/network_chaos_factory.py b/krkn/scenario_plugins/network_chaos_ng/network_chaos_factory.py
@@ -36,13 +36,17 @@
 from krkn.scenario_plugins.network_chaos_ng.modules.pod_network_filter import (
     PodNetworkFilterModule,
 )
+from krkn.scenario_plugins.network_chaos_ng.modules.vmi_network_chaos import (
+    VmiNetworkChaosModule,
+)
 
 supported_modules = [
     "node_network_filter",
     "pod_network_filter",
     "pod_network_chaos",
     "node_network_chaos",
     "node_interface_down",
+    "vmi_network_chaos",
 ]
 
 
@@ -87,5 +91,11 @@ def get_instance(
             if len(errors) > 0:
                 raise Exception(f"config validation errors: [{';'.join(errors)}]")
             return NodeInterfaceDownModule(scenario_config, kubecli)
+        if config["id"] == "vmi_network_chaos":
+            scenario_config = NetworkChaosConfig(**config)
+            errors = scenario_config.validate()
+            if len(errors) > 0:
+                raise Exception(f"config validation errors: [{';'.join(errors)}]")
+            return VmiNetworkChaosModule(scenario_config, kubecli)
         else:
             raise Exception(f"invalid network chaos id {config['id']}")
diff --git a/tests/test_node_interface_down.py b/tests/test_node_interface_down.py
@@ -167,13 +167,15 @@ def test_run_recovery_is_scheduled_before_interface_goes_down(self, _mock_log, _
     @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
     @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
     def test_run_sleeps_test_duration(self, mock_log, mock_deploy, mock_sleep):
+        # test_duration is embedded in the shell command (sleep {n} && ip link set up),
+        # so no Python time.sleep(test_duration) should be called.
         self.config.test_duration = 45
         self.config.recovery_time = 0
 
         self.module.run("worker-1")
 
         sleep_values = [c[0][0] for c in mock_sleep.call_args_list]
-        self.assertIn(45, sleep_values)
+        self.assertNotIn(45, sleep_values)
 
     @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
     @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
@@ -185,7 +187,7 @@ def test_run_sleeps_recovery_time_when_set(self, mock_log, mock_deploy, mock_sle
         self.module.run("worker-1")
 
         sleep_values = [c[0][0] for c in mock_sleep.call_args_list]
-        self.assertIn(30, sleep_values)
+        self.assertNotIn(30, sleep_values)
         self.assertIn(15, sleep_values)
 
     @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
@@ -198,7 +200,7 @@ def test_run_no_recovery_sleep_when_zero(self, mock_log, mock_deploy, mock_sleep
         self.module.run("worker-1")
 
         sleep_values = [c[0][0] for c in mock_sleep.call_args_list]
-        self.assertIn(30, sleep_values)
+        self.assertNotIn(30, sleep_values)
         self.assertNotIn(0, sleep_values)
 
     @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")