Skip to content

Commit 28a230e

Browse files
committed
adding vmi network outages
Signed-off-by: Paige Patton <[email protected]>
1 parent 9d06239 commit 28a230e

11 files changed

Lines changed: 951 additions & 15 deletions

File tree

config/config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ kraken:
5353
- scenarios/kube/node-network-chaos.yml
5454
- scenarios/kube/pod-network-chaos.yml
5555
- scenarios/kube/node_interface_down.yaml
56+
- scenarios/openshift/virt_network.yaml
5657
- kubevirt_vm_outage:
5758
- scenarios/kubevirt/kubevirt-vm-outage.yaml
5859
- http_load_scenarios:

krkn/scenario_plugins/network_chaos_ng/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
class NetworkChaosScenarioType(Enum):
    """Kind of target a network chaos scenario is applied to."""

    # Scenario targets cluster nodes.
    Node = 1
    # Scenario targets pods.
    Pod = 2
    # Scenario targets VMIs (resolved to "namespace/vmi-name" targets).
    VMI = 3
2324

2425

2526
@dataclass

krkn/scenario_plugins/network_chaos_ng/modules/abstract_network_chaos_module.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,35 @@ def get_pod_targets(self, config: BaseNetworkChaosConfig):
9191
)
9292
return [config.target]
9393

94+
def get_vmi_targets(self, config: BaseNetworkChaosConfig) -> list[str]:
    """
    Return the VMI targets of a scenario in "namespace/vmi-name" format.

    VMIs are fetched through the kubernetes client using `config.target`
    as the name regex (".*" when no target is set) and `config.namespace`
    as the namespace expression, then optionally post-filtered by
    `config.label_selector`.

    The selector is written as "key=value". Several comma-separated terms
    ("k1=v1,k2=v2") are accepted and must all match; whitespace around
    terms, keys and values is ignored.

    :param config: scenario configuration; `namespace`, `target` and
        `label_selector` are read
    :return: the matching targets, or an empty list when nothing matches
    :raises Exception: when no namespace is configured or when the label
        selector contains no valid "key=value" term
    """
    if not config.namespace:
        raise Exception("namespace not specified for VMI scenario, aborting")

    name_regex = config.target or ".*"
    vmis = self.kubecli.get_lib_kubernetes().get_vmis(name_regex, config.namespace)
    if not vmis:
        return []

    if config.label_selector:
        invalid_selector = (
            f"invalid label_selector format: '{config.label_selector}', expected 'key=value'"
        )
        wanted_labels = {}
        for term in config.label_selector.split(","):
            term = term.strip()
            if not term:
                # tolerate stray/trailing commas
                continue
            try:
                label_key, label_value = term.split("=", 1)
            except ValueError as err:
                raise Exception(invalid_selector) from err
            wanted_labels[label_key.strip()] = label_value.strip()
        if not wanted_labels:
            raise Exception(invalid_selector)

        filtered = []
        for vmi in vmis:
            # `metadata` / `labels` may be absent or explicitly null
            labels = (vmi.get("metadata") or {}).get("labels") or {}
            if all(labels.get(key) == value for key, value in wanted_labels.items()):
                filtered.append(vmi)
        vmis = filtered

    return [
        f"{vmi['metadata']['namespace']}/{vmi['metadata']['name']}"
        for vmi in vmis
    ]
122+
94123
def __init__(
95124
self,
96125
base_network_config: BaseNetworkChaosConfig,

krkn/scenario_plugins/network_chaos_ng/modules/node_interface_down.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
# Copyright 2025 The Krkn Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
import queue
216
import time
317
from typing import Tuple
@@ -101,13 +115,6 @@ def run(self, target: str, error_queue: queue.Queue = None):
101115
target,
102116
)
103117

104-
log_info(
105-
f"waiting {self.config.test_duration} seconds for interface(s) to recover",
106-
parallel,
107-
target,
108-
)
109-
time.sleep(self.config.test_duration)
110-
111118
log_info(
112119
f"waiting for node {target} to become Ready after interface recovery",
113120
parallel,

krkn/scenario_plugins/network_chaos_ng/modules/utils.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import yaml
1919
from jinja2 import FileSystemLoader, Environment
2020
from krkn_lib.k8s import KrknKubernetes
21-
from krkn_lib.models.k8s import Pod
2221

2322
from krkn.scenario_plugins.network_chaos_ng.models import (
2423
BaseNetworkChaosConfig,
@@ -110,6 +109,64 @@ def get_pod_default_interface(
110109
return output.replace("\n", "")
111110

112111

112+
def find_virt_launcher_netns_pid(
    chaos_pod_name: str, namespace: str, pids: list[str], kubecli: KrknKubernetes
) -> str:
    """Return the first PID that is in the virt-launcher's network namespace.

    get_pod_pids returns all PIDs from the compute container's cgroup. Some of
    those processes (helpers, privileged threads) run in the HOST network
    namespace rather than the pod's netns. nsenter-ing one of those would
    target the node's physical interfaces (e.g. ens4) instead of the
    virt-launcher's bridge slave.

    tap0 is a KubeVirt-specific tap device that only exists inside the
    virt-launcher's netns, so its presence is a reliable probe.

    :param chaos_pod_name: pod in which the nsenter probe is executed
    :param namespace: namespace of that pod
    :param pids: candidate PIDs, probed in the given order
    :param kubecli: client used to exec the probe command in the pod
    :return: the first PID whose netns contains tap0, or "" when none does
    """
    import logging

    for pid in pids:
        # stderr is discarded so a "device does not exist" message that
        # mentions tap0 cannot be mistaken for a successful probe
        probe_cmd = f"nsenter --target {pid} --net -- ip link show tap0 2>/dev/null"
        try:
            out = kubecli.exec_cmd_in_pod([probe_cmd], chaos_pod_name, namespace)
        except Exception as exc:
            # best-effort probe: a failing PID is skipped, but leave a trace
            logging.debug(
                "tap0 probe failed for pid %s in pod %s/%s: %s",
                pid,
                namespace,
                chaos_pod_name,
                exc,
            )
            continue
        if isinstance(out, str) and "tap0" in out:
            return pid
    return ""
135+
136+
137+
def _parse_iface_name(raw, prefix: str) -> str:
    """Return `raw` as a clean interface name starting with `prefix`, else ""."""
    if not isinstance(raw, str):
        return ""
    # `ip link` prints linked devices as "name@peer"; keep only the name
    name = raw.strip().split("@", 1)[0]
    if not name.startswith(prefix):
        return ""
    # Linux interface names are at most 15 characters and never contain
    # whitespace, "/" or ":"; anything else is an error message, not a name
    if len(name) > 15 or any(ch.isspace() or ch in "/:" for ch in name):
        return ""
    return name


def get_vmi_tap_interface(
    chaos_pod_name: str, namespace: str, pid: str, kubecli: KrknKubernetes
) -> str:
    """Find the VMI's primary tap interface inside the virt-launcher network namespace.

    The tap device is the VM-facing member of the KubeVirt bridge:
      ovn-udn1-nic -> k6t-ovn-udn1 (bridge) -> tap0 -> QEMU (VM guest)

    We locate it by finding the tap member of the k6t-* bridge rather than
    grepping for any tap-prefixed device, so the detection works regardless
    of how many interfaces the VM has.

    Blocking the tap interface isolates only this VMI. Blocking the bridge
    slave (ovn-udn1-nic) would also sever OVN's BFD heartbeats and trigger
    a node-wide network reconvergence.

    Command output is validated before use: a reply that is not a single
    interface name with the expected prefix (for example an error message
    from nsenter or ip) is treated as "not found" instead of being fed into
    the next shell command or returned to the caller.

    :param chaos_pod_name: pod in which the nsenter commands are executed
    :param namespace: namespace of that pod
    :param pid: PID whose network namespace is entered
    :param kubecli: client used to exec commands in the pod
    :return: the tap interface name, or "" when no bridge or tap is found
    """
    # Find the k6t-* bridge name first, then find its tap member.
    bridge_cmd = (
        f"nsenter --target {pid} --net -- "
        f"ip link show | grep ': k6t-' | head -1 | cut -d: -f2 | tr -d ' '"
    )
    bridge = _parse_iface_name(
        kubecli.exec_cmd_in_pod([bridge_cmd], chaos_pod_name, namespace), "k6t-"
    )
    if not bridge:
        return ""

    tap_cmd = (
        f"nsenter --target {pid} --net -- "
        f"ip link show master {bridge} | grep ': tap' | head -1 | cut -d: -f2 | tr -d ' '"
    )
    return _parse_iface_name(
        kubecli.exec_cmd_in_pod([tap_cmd], chaos_pod_name, namespace), "tap"
    )
168+
169+
113170
def setup_network_chaos_ng_scenario(
114171
config: BaseNetworkChaosConfig,
115172
node_name: str,

krkn/scenario_plugins/network_chaos_ng/modules/utils_network_filter.py

Lines changed: 77 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,29 @@ def generate_rules(
2525
input_rules = []
2626
output_rules = []
2727
for interface in interfaces:
28-
for port in config.ports:
28+
if config.ports:
29+
for port in config.ports:
30+
if config.egress:
31+
for protocol in set(config.protocols):
32+
output_rules.append(
33+
f"iptables -I OUTPUT 1 -p {protocol} --dport {port} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
34+
)
35+
if config.ingress:
36+
for protocol in set(config.protocols):
37+
input_rules.append(
38+
f"iptables -I INPUT 1 -i {interface} -p {protocol} --dport {port} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
39+
)
40+
else:
41+
# empty ports means block all traffic on all ports
2942
if config.egress:
3043
for protocol in set(config.protocols):
3144
output_rules.append(
32-
f"iptables -I OUTPUT 1 -p {protocol} --dport {port} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
45+
f"iptables -I OUTPUT 1 -p {protocol} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
3346
)
34-
3547
if config.ingress:
3648
for protocol in set(config.protocols):
3749
input_rules.append(
38-
f"iptables -I INPUT 1 -i {interface} -p {protocol} --dport {port} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
50+
f"iptables -I INPUT 1 -i {interface} -p {protocol} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
3951
)
4052
return input_rules, output_rules
4153

@@ -115,3 +127,64 @@ def generate_namespaced_rules(
115127
namespaced_output_rules.extend(ns_output_rules)
116128

117129
return namespaced_input_rules, namespaced_output_rules
130+
131+
132+
def apply_tc_vmi_chaos(
    kubecli: KrknKubernetes,
    chaos_pod_name: str,
    namespace: str,
    pid: str,
    iface: str,
    parallel: bool,
    vmi_name: str,
):
    """Black-hole a VMI by installing tc rules on its tap interface.

    The rules are installed on the tap device (the VM-facing end of the
    KubeVirt bridge) and not on the bridge slave (ovn-udn1-nic): dropping
    traffic on the bridge slave would also cut OVN's BFD heartbeats and
    cause a node-wide network reconvergence, whereas the tap device only
    connects to QEMU, so only this VMI is isolated.

    tc works at the device layer, below iptables, and does not need
    br_netfilter. Three commands are executed, in this order, inside the
    network namespace of `pid`:
      1. root netem qdisc with 100% loss -> drops traffic sent toward the VM
      2. ingress qdisc
      3. matchall filter with a drop action on the ingress qdisc
         -> drops traffic sent by the VM
    A single pid is enough because all processes in the container share
    one network namespace.

    :param kubecli: client used to exec commands in the chaos pod
    :param chaos_pod_name: pod in which the tc commands are executed
    :param namespace: namespace of that pod
    :param pid: PID whose network namespace is entered with nsenter
    :param iface: interface the rules are installed on
    :param parallel: forwarded to log_info
    :param vmi_name: forwarded to log_info as the log target
    """
    netns_prefix = f"nsenter --target {pid} --net --"
    log_info(f"applying tc block on {iface} (egress netem + ingress drop)", parallel, vmi_name)

    # Order matters: the ingress qdisc must exist before the filter is
    # attached to it (parent ffff:).
    tc_steps = (
        f"{netns_prefix} tc qdisc add dev {iface} root netem loss 100%",
        f"{netns_prefix} tc qdisc add dev {iface} ingress",
        f"{netns_prefix} tc filter add dev {iface} parent ffff: protocol all matchall action drop",
    )
    for step in tc_steps:
        kubecli.exec_cmd_in_pod([step], chaos_pod_name, namespace)
170+
171+
172+
def clean_tc_vmi_chaos(
    kubecli: KrknKubernetes,
    chaos_pod_name: str,
    namespace: str,
    pid: str,
    iface: str,
):
    """Remove tc qdiscs applied by apply_tc_vmi_chaos.

    Cleanup is best-effort: both the root and the ingress qdisc deletions
    are always attempted and a failure of one never raises. Failures are
    logged as warnings, because a qdisc that is left behind keeps dropping
    the VMI's traffic after the scenario has ended.

    :param kubecli: client used to exec commands in the chaos pod
    :param chaos_pod_name: pod in which the tc commands are executed
    :param namespace: namespace of that pod
    :param pid: PID whose network namespace is entered with nsenter
    :param iface: interface the qdiscs are removed from
    """
    import logging

    ns = f"nsenter --target {pid} --net --"
    for cmd in (
        f"{ns} tc qdisc del dev {iface} root",
        f"{ns} tc qdisc del dev {iface} ingress",
    ):
        try:
            kubecli.exec_cmd_in_pod([cmd], chaos_pod_name, namespace)
        except Exception as exc:
            logging.warning(
                "tc cleanup command '%s' failed in pod %s/%s: %s",
                cmd,
                namespace,
                chaos_pod_name,
                exc,
            )
189+
190+

0 commit comments

Comments
 (0)