Skip to content

Commit e9379ed

Browse files
committed
adding VMI network outages
Signed-off-by: Paige Patton <prubenda@redhat.com>
1 parent e1d4f1e commit e9379ed

6 files changed

Lines changed: 291 additions & 11 deletions

File tree

krkn/scenario_plugins/network_chaos_ng/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
class NetworkChaosScenarioType(Enum):
2121
Node = 1
2222
Pod = 2
23+
VMI = 3
2324

2425

2526
@dataclass

krkn/scenario_plugins/network_chaos_ng/modules/abstract_network_chaos_module.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,35 @@ def get_pod_targets(self, config: BaseNetworkChaosConfig):
9191
)
9292
return [config.target]
9393

94+
def get_vmi_targets(self, config: BaseNetworkChaosConfig) -> list[str]:
95+
"""
96+
Returns the list of VMI targets in "namespace/vmi-name" format.
97+
Supports regex matching on both name (via `target`) and namespace,
98+
and optional post-filtering by `label_selector` in "key=value" format.
99+
"""
100+
if not config.namespace:
101+
raise Exception("namespace not specified for VMI scenario, aborting")
102+
name_regex = config.target if config.target else ".*"
103+
vmis = self.kubecli.get_lib_kubernetes().get_vmis(name_regex, config.namespace)
104+
if not vmis:
105+
return []
106+
if config.label_selector:
107+
try:
108+
label_key, label_value = config.label_selector.split("=", 1)
109+
except ValueError:
110+
raise Exception(
111+
f"invalid label_selector format: '{config.label_selector}', expected 'key=value'"
112+
)
113+
vmis = [
114+
vmi
115+
for vmi in vmis
116+
if vmi.get("metadata", {}).get("labels", {}).get(label_key) == label_value
117+
]
118+
return [
119+
f"{vmi['metadata']['namespace']}/{vmi['metadata']['name']}"
120+
for vmi in vmis
121+
]
122+
94123
def __init__(
95124
self,
96125
base_network_config: BaseNetworkChaosConfig,

krkn/scenario_plugins/network_chaos_ng/modules/node_interface_down.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
# Copyright 2025 The Krkn Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
114
import queue
215
import time
316
from typing import Tuple
@@ -84,13 +97,16 @@ def run(self, target: str, error_queue: queue.Queue = None):
8497
# to the control plane, so exec_cmd_in_pod can no longer reach the pod.
8598
# The background process runs entirely on the node and fires regardless of
8699
# control-plane connectivity.
100+
# The recovery process runs with all stdio redirected to /dev/null so it does
101+
# not hold the exec session's file descriptors open, allowing exec to return
102+
# immediately after down_cmds completes.
87103
recovery_cmds = " && ".join(
88104
[f"ip link set {iface} up" for iface in interfaces]
89105
)
90106
down_cmds = " && ".join(
91107
[f"ip link set {iface} down" for iface in interfaces]
92108
)
93-
cmd = f"(sleep {self.config.test_duration} && {recovery_cmds}) & {down_cmds}"
109+
cmd = f"sh -c 'sleep {self.config.test_duration} && {recovery_cmds}' </dev/null >/dev/null 2>&1 & {down_cmds}"
94110
self.kubecli.get_lib_kubernetes().exec_cmd_in_pod(
95111
[cmd], pod_name, self.config.namespace
96112
)
@@ -101,13 +117,6 @@ def run(self, target: str, error_queue: queue.Queue = None):
101117
target,
102118
)
103119

104-
log_info(
105-
f"waiting {self.config.test_duration} seconds for interface(s) to recover",
106-
parallel,
107-
target,
108-
)
109-
time.sleep(self.config.test_duration)
110-
111120
log_info(
112121
f"waiting for node {target} to become Ready after interface recovery",
113122
parallel,
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
# Copyright 2025 The Krkn Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import queue
15+
import time
16+
from typing import Tuple
17+
18+
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
19+
from krkn_lib.utils import get_random_string
20+
21+
from krkn.scenario_plugins.network_chaos_ng.models import (
22+
NetworkChaosScenarioType,
23+
BaseNetworkChaosConfig,
24+
NetworkChaosConfig,
25+
)
26+
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
27+
AbstractNetworkChaosModule,
28+
)
29+
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
30+
log_info,
31+
log_error,
32+
deploy_network_chaos_ng_pod,
33+
get_pod_default_interface,
34+
)
35+
from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_chaos import (
36+
common_set_limit_rules,
37+
common_delete_limit_rules,
38+
)
39+
40+
41+
class VmiNetworkChaosModule(AbstractNetworkChaosModule):
42+
43+
def __init__(self, config: NetworkChaosConfig, kubecli: KrknTelemetryOpenshift):
44+
super().__init__(config, kubecli)
45+
self.config = config
46+
47+
def run(self, target: str, error_queue: queue.Queue = None):
48+
# target is "namespace/vmi-name" as produced by get_vmi_targets()
49+
parallel = False
50+
if error_queue:
51+
parallel = True
52+
try:
53+
namespace, vmi_name = target.split("/", 1)
54+
network_chaos_pod_name = f"vmi-network-chaos-{get_random_string(5)}"
55+
container_name = f"fedora-container-{get_random_string(5)}"
56+
57+
log_info(
58+
f"creating workload to inject network chaos in VMI {vmi_name} "
59+
f"latency:{str(self.config.latency) if self.config.latency else '0'}, "
60+
f"packet drop:{str(self.config.loss) if self.config.loss else '0'} "
61+
f"bandwidth restriction:{str(self.config.bandwidth) if self.config.bandwidth else '0'} ",
62+
parallel,
63+
network_chaos_pod_name,
64+
)
65+
66+
# Resolve which node the VMI is running on
67+
vmi = self.kubecli.get_lib_kubernetes().get_vmi(vmi_name, namespace)
68+
if not vmi:
69+
raise Exception(
70+
f"VMI {vmi_name} not found in namespace {namespace}"
71+
)
72+
73+
node_name = vmi.get("status", {}).get("nodeName")
74+
if not node_name:
75+
raise Exception(
76+
f"unable to determine node for VMI {vmi_name} in namespace {namespace}; "
77+
"VMI may not be in Running phase"
78+
)
79+
80+
log_info(
81+
f"VMI {vmi_name} is running on node {node_name}",
82+
parallel,
83+
network_chaos_pod_name,
84+
)
85+
86+
# The virt-launcher pod carries the VMI's network namespace.
87+
# It is labelled kubevirt.io/domain=<vmi-name>.
88+
virt_launcher_pods = self.kubecli.get_lib_kubernetes().list_pods(
89+
namespace, label_selector=f"kubevirt.io/domain={vmi_name}"
90+
)
91+
if not virt_launcher_pods:
92+
raise Exception(
93+
f"no virt-launcher pod found for VMI {vmi_name} in namespace {namespace}"
94+
)
95+
virt_launcher_pod_name = virt_launcher_pods[0]
96+
97+
log_info(
98+
f"resolved virt-launcher pod {virt_launcher_pod_name} for VMI {vmi_name}",
99+
parallel,
100+
network_chaos_pod_name,
101+
)
102+
103+
# Deploy the privileged chaos pod onto the VMI's node.
104+
# host_network=False so that tc rules are applied via nsenter into
105+
# the virt-launcher pod's network namespace rather than the node's.
106+
deploy_network_chaos_ng_pod(
107+
self.config,
108+
node_name,
109+
network_chaos_pod_name,
110+
self.kubecli.get_lib_kubernetes(),
111+
container_name,
112+
host_network=False,
113+
)
114+
115+
# Detect the default network interface. When host_network=False the
116+
# chaos pod has its own network namespace; the interface name returned
117+
# (typically "eth0") is used as the target interface inside the
118+
# virt-launcher's network namespace via nsenter, which also uses "eth0"
119+
# as its primary interface.
120+
if len(self.config.interfaces) == 0:
121+
interfaces = [
122+
get_pod_default_interface(
123+
network_chaos_pod_name,
124+
self.config.namespace,
125+
self.kubecli.get_lib_kubernetes(),
126+
)
127+
]
128+
if not interfaces[0]:
129+
log_error(
130+
"no network interface detected; impossible to execute the network chaos scenario",
131+
parallel,
132+
network_chaos_pod_name,
133+
)
134+
self.kubecli.get_lib_kubernetes().delete_pod(
135+
network_chaos_pod_name, self.config.namespace
136+
)
137+
return
138+
log_info(
139+
f"detected default interface: {interfaces[0]}",
140+
parallel,
141+
network_chaos_pod_name,
142+
)
143+
else:
144+
interfaces = self.config.interfaces
145+
146+
# Retrieve the container IDs of the virt-launcher pod so we can
147+
# identify the cgroup and resolve the host-visible PIDs via /proc.
148+
container_ids = self.kubecli.get_lib_kubernetes().get_container_ids(
149+
virt_launcher_pod_name, namespace
150+
)
151+
if not container_ids:
152+
raise Exception(
153+
f"impossible to resolve container ID for virt-launcher pod "
154+
f"{virt_launcher_pod_name} in namespace {namespace}"
155+
)
156+
157+
log_info(
158+
f"targeting virt-launcher container {container_ids[0]}",
159+
parallel,
160+
network_chaos_pod_name,
161+
)
162+
163+
# Resolve the host PIDs for the virt-launcher container so that
164+
# nsenter can enter its network namespace.
165+
pids = self.kubecli.get_lib_kubernetes().get_pod_pids(
166+
base_pod_name=network_chaos_pod_name,
167+
base_pod_namespace=self.config.namespace,
168+
base_pod_container_name=container_name,
169+
pod_name=virt_launcher_pod_name,
170+
pod_namespace=namespace,
171+
pod_container_id=container_ids[0],
172+
)
173+
if not pids:
174+
raise Exception(
175+
f"impossible to resolve PIDs for virt-launcher pod {virt_launcher_pod_name}"
176+
)
177+
178+
log_info(
179+
f"resolved PIDs {pids} on node {node_name} for VMI {vmi_name}",
180+
parallel,
181+
network_chaos_pod_name,
182+
)
183+
184+
common_set_limit_rules(
185+
self.config.egress,
186+
self.config.ingress,
187+
interfaces,
188+
self.config.bandwidth,
189+
self.config.latency,
190+
self.config.loss,
191+
parallel,
192+
network_chaos_pod_name,
193+
self.kubecli.get_lib_kubernetes(),
194+
network_chaos_pod_name,
195+
self.config.namespace,
196+
pids,
197+
)
198+
199+
time.sleep(self.config.test_duration)
200+
201+
log_info("removing tc rules", parallel, network_chaos_pod_name)
202+
203+
common_delete_limit_rules(
204+
self.config.egress,
205+
self.config.ingress,
206+
interfaces,
207+
network_chaos_pod_name,
208+
self.config.namespace,
209+
self.kubecli.get_lib_kubernetes(),
210+
pids,
211+
parallel,
212+
network_chaos_pod_name,
213+
)
214+
215+
self.kubecli.get_lib_kubernetes().delete_pod(
216+
network_chaos_pod_name, self.config.namespace
217+
)
218+
219+
except Exception as e:
220+
if error_queue is None:
221+
raise e
222+
else:
223+
error_queue.put(str(e))
224+
225+
def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
226+
return NetworkChaosScenarioType.VMI, self.config
227+
228+
def get_targets(self) -> list[str]:
229+
return self.get_vmi_targets(self.config)

krkn/scenario_plugins/network_chaos_ng/network_chaos_factory.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,17 @@
3636
from krkn.scenario_plugins.network_chaos_ng.modules.pod_network_filter import (
3737
PodNetworkFilterModule,
3838
)
39+
from krkn.scenario_plugins.network_chaos_ng.modules.vmi_network_chaos import (
40+
VmiNetworkChaosModule,
41+
)
3942

4043
supported_modules = [
4144
"node_network_filter",
4245
"pod_network_filter",
4346
"pod_network_chaos",
4447
"node_network_chaos",
4548
"node_interface_down",
49+
"vmi_network_chaos",
4650
]
4751

4852

@@ -87,5 +91,11 @@ def get_instance(
8791
if len(errors) > 0:
8892
raise Exception(f"config validation errors: [{';'.join(errors)}]")
8993
return NodeInterfaceDownModule(scenario_config, kubecli)
94+
if config["id"] == "vmi_network_chaos":
95+
scenario_config = NetworkChaosConfig(**config)
96+
errors = scenario_config.validate()
97+
if len(errors) > 0:
98+
raise Exception(f"config validation errors: [{';'.join(errors)}]")
99+
return VmiNetworkChaosModule(scenario_config, kubecli)
90100
else:
91101
raise Exception(f"invalid network chaos id {config['id']}")

tests/test_node_interface_down.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,13 +167,15 @@ def test_run_recovery_is_scheduled_before_interface_goes_down(self, _mock_log, _
167167
@patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
168168
@patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
169169
def test_run_sleeps_test_duration(self, mock_log, mock_deploy, mock_sleep):
170+
# test_duration is embedded in the shell command (sleep {n} && ip link set up),
171+
# so no Python time.sleep(test_duration) should be called.
170172
self.config.test_duration = 45
171173
self.config.recovery_time = 0
172174

173175
self.module.run("worker-1")
174176

175177
sleep_values = [c[0][0] for c in mock_sleep.call_args_list]
176-
self.assertIn(45, sleep_values)
178+
self.assertNotIn(45, sleep_values)
177179

178180
@patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
179181
@patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
@@ -185,7 +187,7 @@ def test_run_sleeps_recovery_time_when_set(self, mock_log, mock_deploy, mock_sle
185187
self.module.run("worker-1")
186188

187189
sleep_values = [c[0][0] for c in mock_sleep.call_args_list]
188-
self.assertIn(30, sleep_values)
190+
self.assertNotIn(30, sleep_values)
189191
self.assertIn(15, sleep_values)
190192

191193
@patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
@@ -198,7 +200,7 @@ def test_run_no_recovery_sleep_when_zero(self, mock_log, mock_deploy, mock_sleep
198200
self.module.run("worker-1")
199201

200202
sleep_values = [c[0][0] for c in mock_sleep.call_args_list]
201-
self.assertIn(30, sleep_values)
203+
self.assertNotIn(30, sleep_values)
202204
self.assertNotIn(0, sleep_values)
203205

204206
@patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")

0 commit comments

Comments
 (0)