Skip to content

Commit 3b14587

Browse files
committed
fix delayed shutdown for non-smtp delivery handlers
Occasionally we'll have someone report that systemd timed out and sigkill'd their kumo on shutdown. One possible scenario for this is a lua delivery handler that is taking too long, presumably because the other end of it (eg: webhook or other custom endpoint) is not responding in a timely fashion. The way that we handle shutdown is that we compute a maximum theoretical timeout value by summing up all of the smtp client timeout values. Some of those can be several minutes in duration because the are using default values derived from a very conservative set of values suggested by the SMTP RFCs from the '70s. Those obviously should not apply to a custom delivery handler, but also, in the context of an established SMTP session, we should not add in the connection-establishment-specific values when we're just waiting for a per-message send. This commit addresses this situation on two fronts: * Introduce a new system_shutdown_timeout value that allows the user to conveniently express their desired timeout value in a single option. This is *not* set by default! * The default value for system_shutdown_timeout is computed by summing the per-message-delivery smtp timeout options, which is a much more reasonable, and more importantly, shorter than our 300s TimeoutStopSec value in kumomta.service
1 parent 7d3869d commit 3b14587

7 files changed

Lines changed: 63 additions & 5 deletions

File tree

assets/policy-extras/shaping.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,3 +232,4 @@ connection_limit = "local:5"
232232
# Allow a healthy amount in the ready queue in case things get
233233
# busy
234234
max_ready = 10000
235+
system_shutdown_timeout = "30s"

crates/kumo-api-types/src/egress_path.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,12 @@ pub struct EgressPathConfig {
215215
#[serde(flatten)]
216216
pub client_timeouts: SmtpClientTimeouts,
217217

218+
/// How long to wait for an established session to gracefully
219+
/// close when the system is shutting down. After this period
220+
/// has elapsed, sessions will be aborted.
221+
#[serde(default, with = "duration_serde")]
222+
pub system_shutdown_timeout: Option<Duration>,
223+
218224
#[serde(default = "EgressPathConfig::default_max_ready")]
219225
pub max_ready: usize,
220226

@@ -342,6 +348,7 @@ impl Default for EgressPathConfig {
342348
max_connection_rate: None,
343349
max_deliveries_per_connection: Self::default_max_deliveries_per_connection(),
344350
client_timeouts: SmtpClientTimeouts::default(),
351+
system_shutdown_timeout: None,
345352
prohibited_hosts: CidrSet::default_prohibited_hosts(),
346353
skip_hosts: CidrSet::default(),
347354
ehlo_domain: None,

crates/kumo-api-types/src/shaping.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1948,6 +1948,7 @@ MergedEntry {
19481948
starttls_timeout: 5s,
19491949
auth_timeout: 60s,
19501950
},
1951+
system_shutdown_timeout: None,
19511952
max_ready: 1024,
19521953
consecutive_connection_failures_before_delay: 100,
19531954
smtp_port: 25,
@@ -2092,6 +2093,7 @@ MergedEntry {
20922093
starttls_timeout: 5s,
20932094
auth_timeout: 60s,
20942095
},
2096+
system_shutdown_timeout: None,
20952097
max_ready: 1024,
20962098
consecutive_connection_failures_before_delay: 100,
20972099
smtp_port: 25,
@@ -2153,6 +2155,7 @@ MergedEntry {
21532155
starttls_timeout: 5s,
21542156
auth_timeout: 60s,
21552157
},
2158+
system_shutdown_timeout: None,
21562159
max_ready: 1024,
21572160
consecutive_connection_failures_before_delay: 100,
21582161
smtp_port: 25,
@@ -2299,6 +2302,7 @@ MergedEntry {
22992302
starttls_timeout: 5s,
23002303
auth_timeout: 60s,
23012304
},
2305+
system_shutdown_timeout: None,
23022306
max_ready: 1024,
23032307
consecutive_connection_failures_before_delay: 100,
23042308
smtp_port: 25,

crates/kumod/src/ready_queue.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,11 @@ impl ReadyQueueManager {
571571
_ = wait_for_shutdown => {
572572
shutting_down = true;
573573
if force_reap_deadline.is_none() {
574-
let duration = queue.path_config.borrow().client_timeouts.total_message_send_duration();
574+
let path_config = queue.path_config.borrow();
575+
576+
let duration = path_config.system_shutdown_timeout.unwrap_or_else(|| {
577+
path_config.client_timeouts.total_message_send_duration()
578+
});
575579
force_reap_deadline.replace(tokio::time::Instant::now() + duration);
576580
tracing::debug!("{name}: reap deadline in {duration:?}");
577581
}
@@ -918,7 +922,9 @@ impl ReadyQueue {
918922
return;
919923
}
920924

921-
let lease_duration = path_config.client_timeouts.total_message_send_duration();
925+
let lease_duration = path_config
926+
.client_timeouts
927+
.total_connection_lifetime_duration();
922928
let limit_name = format!("kumomta.connection_limit.{}", self.name);
923929
let mut limits = vec![(
924930
&limit_name,
@@ -1931,7 +1937,7 @@ impl Dispatcher {
19311937
self.path_config
19321938
.borrow()
19331939
.client_timeouts
1934-
.total_message_send_duration(),
1940+
.total_connection_lifetime_duration(),
19351941
)
19361942
.await
19371943
.is_err()

crates/rfc5321/src/client_types.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,9 @@ impl SmtpClientTimeouts {
138138
}
139139
}
140140

141-
/// Compute theoretical maximum lifetime of a single message send
142-
pub fn total_message_send_duration(&self) -> Duration {
141+
/// Compute theoretical maximum lifetime of a connection when
142+
/// sending a single message
143+
pub fn total_connection_lifetime_duration(&self) -> Duration {
143144
self.connect_timeout
144145
+ self.banner_timeout
145146
+ self.ehlo_timeout
@@ -151,6 +152,15 @@ impl SmtpClientTimeouts {
151152
+ self.starttls_timeout
152153
+ self.idle_timeout
153154
}
155+
156+
/// Compute theoretical maximum lifetime of a single message send
157+
/// on an already established connection
158+
pub fn total_message_send_duration(&self) -> Duration {
159+
self.mail_from_timeout
160+
+ self.rcpt_to_timeout
161+
+ self.data_timeout
162+
+ self.data_dot_timeout
163+
}
154164
}
155165

156166
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Hash)]

docs/changelog/main.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@
4747
version, you may need to review and adjust both that and the corresponding
4848
[kumo.set_ready_qmaint_threads](../reference/kumo/set_ready_qmaint_threads.md)
4949
tuning.
50+
* New
51+
[system_shutdown_timeout](../reference/kumo/make_egress_path/system_shutdown_timeout.md)
52+
option to specify how long we should wait for an in-flight delivery attempt
53+
to wrap up before terminating it once we have received a request to shutdown
54+
kumod.
5055

5156
## Fixes
5257

@@ -56,3 +61,6 @@
5661
* tsa-daemon now increases its soft `NOFILE` limit to match its hard limit
5762
on startup (just as we do in kumod), which helps to avoid issues with
5863
running out of file descriptors on systems with very large core counts.
64+
* Shutdown could take longer than the 300s permitted by kumomta.service
65+
when lua delivery handlers are experiencing delays, leading to systemd
66+
issuing a SIGKILL.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# system_shutdown_timeout
2+
3+
{{since('dev')}}
4+
5+
How long to wait for an in-flight delivery attempt to gracefully complete when
6+
the system is being shutdown. Once this timeout is reached, any open sessions
7+
will be aborted.
8+
9+
The default value is computed by summing up the per-message-delivery timeout
10+
values:
11+
12+
* [mail_from_timeout](mail_from_timeout.md) +
13+
* [rcpt_to_timeout](rcpt_to_timeout.md) +
14+
* [data_timeout](data_timeout.md) +
15+
* [data_dot_timeout](data_dot_timeout.md)
16+
17+
!!! caution
18+
If you make this value too short you increase the risk duplicate delivery
19+
when the sessions are terminated.
20+
21+
22+

0 commit comments

Comments
 (0)