Skip to content

Commit 1c5221b

Browse files
committed
add /api/admin/task-dump api endpoint
This dumps out a trace of all tokio tasks. It is quite expensive, and currently unsettles the tokio runtime such that you need to repeatedly call this endpoint in order for a subsequent graceful shutdown to clock through and complete.
1 parent c151016 commit 1c5221b

9 files changed

Lines changed: 213 additions & 5 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/kumo-server-common/src/http_server/mod.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ impl RouterAndDocs {
183183
add_handlers!(
184184
bump_config_epoch,
185185
memory_stats,
186+
task_dump,
186187
report_metrics,
187188
report_metrics_json,
188189
set_diagnostic_log_filter_v1,
@@ -466,6 +467,41 @@ async fn bump_config_epoch() -> Result<(), AppError> {
466467
Ok(())
467468
}
468469

470+
#[derive(Deserialize)]
471+
struct TaskDumpParams {
472+
#[serde(default)]
473+
timeout: Option<u64>,
474+
}
475+
476+
/// Returns a dump of the runtime task state.
477+
///
478+
/// {{since('dev')}}
479+
///
480+
/// The output is not machine parseable and may change without notice
481+
/// between versions of kumomta.
482+
///
483+
/// Capturing the dump is very expensive and can take several seconds.
484+
/// Capturing a dump is not guaranteed to succeed.
485+
///
486+
/// At the time of writing, capturing a dump can cause a subsequent
487+
/// graceful shutdown to get stuck, unless you repeatedly trigger
488+
/// this API endpoint a few more times to "clock through" the shutdown
489+
/// process and enable it to complete successfully.
490+
#[utoipa::path(
491+
get,
492+
tag="debugging",
493+
path="/api/admin/task-dump",
494+
responses(
495+
(status=200, description="data was returned")
496+
),
497+
)]
498+
async fn task_dump(Query(params): Query<TaskDumpParams>) -> String {
499+
kumo_server_runtime::dump_all_runtimes(tokio::time::Duration::from_secs(
500+
params.timeout.unwrap_or(5),
501+
))
502+
.await
503+
}
504+
469505
/// Returns information about the system memory usage in an unstructured
470506
/// human readable format. The output is not machine parseable and may
471507
/// change without notice between versions of kumomta.

crates/kumo-server-runtime/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@ kumo-prometheus = {path="../kumo-prometheus"}
1010
linkme.workspace = true
1111
parking_lot.workspace = true
1212
prometheus = {workspace=true}
13-
tokio = {workspace=true, features=["full", "tracing"]}
13+
tokio = {workspace=true, features=["full", "tracing", "taskdump"]}
1414
tracing = {workspace=true}

crates/kumo-server-runtime/src/lib.rs

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
use anyhow::Context;
22
use kumo_prometheus::declare_metric;
33
use parking_lot::Mutex;
4-
use std::collections::HashMap;
4+
use std::collections::{BTreeMap, HashMap};
5+
use std::fmt::Write;
56
use std::future::Future;
67
use std::sync::atomic::{AtomicUsize, Ordering};
78
use std::sync::{Arc, LazyLock};
89
use tokio::runtime::Handle;
910
use tokio::task::JoinHandle;
11+
use tokio::time::Duration;
1012

1113
pub static RUNTIME: LazyLock<Runtime> =
1214
LazyLock::new(|| Runtime::new("localset", |cpus| cpus / 4, &LOCALSET_THREADS).unwrap());
@@ -54,6 +56,64 @@ struct RuntimeInner {
5456
name_prefix: String,
5557
}
5658

59+
fn runtimes_by_name() -> BTreeMap<String, Runtime> {
60+
RUNTIMES
61+
.lock()
62+
.iter()
63+
.map(|(name, rt)| (name.clone(), rt.clone()))
64+
.collect()
65+
}
66+
67+
/// Stand-in for the task dump on targets other than Linux.
///
/// This variant is compiled only when the target OS is not Linux. It
/// captures nothing and immediately resolves to a fixed explanatory
/// message, so callers can use the same API on every platform.
///
/// `timeout_duration` is accepted purely so that the signature matches the
/// Linux implementation; it has no effect here.
#[cfg(not(target_os = "linux"))]
pub async fn dump_all_runtimes(timeout_duration: Duration) -> String {
    // Explicitly discard the argument: nothing is captured on this
    // platform, and leaving it untouched would raise an `unused_variables`
    // warning on every non-Linux build.
    let _ = timeout_duration;
    "Runtime state dumping is not supported on this system".into()
}
71+
72+
// NOTE: at the time of writing, calling this once will prevent a
73+
// subsequent graceful shutdown from completing.
74+
//
75+
// You will need to call this multiple times to allow the graceful
76+
// shutdown to "clock through" and finish successfully.
77+
// I do not know what exactly causes that stutter/stickiness.
78+
#[cfg(target_os = "linux")]
79+
pub async fn dump_all_runtimes(timeout_duration: Duration) -> String {
80+
let runtimes = runtimes_by_name();
81+
let mut dumps = vec![];
82+
83+
async fn collect_dump(
84+
label: &str,
85+
handle: &tokio::runtime::Handle,
86+
timeout_duration: Duration,
87+
) -> String {
88+
match tokio::time::timeout(timeout_duration, handle.dump()).await {
89+
Err(_) => format!("Runtime {label}: Timeout while collecting runtime dump"),
90+
Ok(dump) => {
91+
let label = label.to_string();
92+
match tokio::task::spawn_blocking(move || {
93+
let mut output = format!("Runtime: {label}\n");
94+
for (i, task) in dump.tasks().iter().enumerate() {
95+
let trace = task.trace();
96+
writeln!(&mut output, "{label} TASK {i}:\n{trace}").ok();
97+
}
98+
output
99+
})
100+
.await
101+
.map_err(|err| format!("spawn_blocking: join failed: {err:#}"))
102+
{
103+
Ok(s) | Err(s) => s,
104+
}
105+
}
106+
}
107+
}
108+
109+
dumps.push(collect_dump("main", &get_main_runtime(), timeout_duration).await);
110+
for (label, rt) in runtimes {
111+
dumps.push(collect_dump(&label, rt.handle(), timeout_duration).await);
112+
}
113+
114+
dumps.join("\n\n")
115+
}
116+
57117
#[derive(Clone)]
58118
pub struct Runtime {
59119
inner: Arc<RuntimeInner>,
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
---
2+
tags:
3+
- debugging
4+
---
5+
# GET /api/admin/task-dump
6+
7+
8+
!!! info
9+
This page was generated by extracting information from a JSON Schema
10+
data file for the API. It may be missing some information, or otherwise
11+
suggest approximate or placeholder values based on information in the
12+
schema file; this is due to limitations on how that data is extracted
13+
from the underlying Rust code and into the JSON Schema, and then again
14+
from there and into these docs.
15+
16+
Returns a dump of the runtime task state.
17+
{{since('dev')}}
18+
19+
The output is not machine parseable and may change without notice
20+
between versions of kumomta.
21+
22+
Capturing the dump is very expensive and can take several seconds.
23+
Capturing a dump is not guaranteed to succeed.
24+
25+
At the time of writing, capturing a dump can cause a subsequent
26+
graceful shutdown to get stuck, unless you repeatedly trigger
27+
this API endpoint a few more times to "clock through" the shutdown
28+
process and enable it to complete successfully.
29+
30+
## Responses
31+
### Status 200
32+
data was returned
33+
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
---
2+
tags:
3+
- debugging
4+
---
5+
# GET /api/admin/task-dump
6+
7+
8+
!!! info
9+
This page was generated by extracting information from a JSON Schema
10+
data file for the API. It may be missing some information, or otherwise
11+
suggest approximate or placeholder values based on information in the
12+
schema file; this is due to limitations on how that data is extracted
13+
from the underlying Rust code and into the JSON Schema, and then again
14+
from there and into these docs.
15+
16+
Returns a dump of the runtime task state.
17+
{{since('dev')}}
18+
19+
The output is not machine parseable and may change without notice
20+
between versions of kumomta.
21+
22+
Capturing the dump is very expensive and can take several seconds.
23+
Capturing a dump is not guaranteed to succeed.
24+
25+
At the time of writing, capturing a dump can cause a subsequent
26+
graceful shutdown to get stuck, unless you repeatedly trigger
27+
this API endpoint a few more times to "clock through" the shutdown
28+
process and enable it to complete successfully.
29+
30+
## Responses
31+
### Status 200
32+
data was returned
33+

docs/reference/kumod.openapi.json

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"license": {
77
"name": "Apache-2.0"
88
},
9-
"version": "2026.02.24-b018f8f1"
9+
"version": "2026.03.11-17b80530"
1010
},
1111
"paths": {
1212
"/api/admin/bounce/v1": {
@@ -465,6 +465,21 @@
465465
}
466466
}
467467
},
468+
"/api/admin/task-dump": {
469+
"get": {
470+
"tags": [
471+
"debugging"
472+
],
473+
"summary": "Returns a dump of the runtime task state.",
474+
"description": "{{since('dev')}}\n\nThe output is not machine parseable and may change without notice\nbetween versions of kumomta.\n\nCapturing the dump is very expensive and can take several seconds.\nCapturing a dump is not guaranteed to succeed.\n\nAt the time of writing, capturing a dump can cause a subsequent\ngraceful shutdown to get stuck, unless you repeatedly trigger\nthis API endpoint a few more times to \"clock through\" the shutdown\nprocess and enable it to complete successfully.",
475+
"operationId": "task_dump",
476+
"responses": {
477+
"200": {
478+
"description": "data was returned"
479+
}
480+
}
481+
}
482+
},
468483
"/api/admin/trace-smtp-client/v1": {
469484
"get": {
470485
"tags": [

docs/reference/proxy-server.openapi.json

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"license": {
77
"name": "Apache-2.0"
88
},
9-
"version": "2026.02.24-3bc3da88"
9+
"version": "2026.03.11-17b80530"
1010
},
1111
"paths": {
1212
"/api/admin/bump-config-epoch": {
@@ -62,6 +62,21 @@
6262
}
6363
}
6464
},
65+
"/api/admin/task-dump": {
66+
"get": {
67+
"tags": [
68+
"debugging"
69+
],
70+
"summary": "Returns a dump of the runtime task state.",
71+
"description": "{{since('dev')}}\n\nThe output is not machine parseable and may change without notice\nbetween versions of kumomta.\n\nCapturing the dump is very expensive and can take several seconds.\nCapturing a dump is not guaranteed to succeed.\n\nAt the time of writing, capturing a dump can cause a subsequent\ngraceful shutdown to get stuck, unless you repeatedly trigger\nthis API endpoint a few more times to \"clock through\" the shutdown\nprocess and enable it to complete successfully.",
72+
"operationId": "task_dump",
73+
"responses": {
74+
"200": {
75+
"description": "data was returned"
76+
}
77+
}
78+
}
79+
},
6580
"/api/machine-info": {
6681
"get": {
6782
"tags": [

docs/reference/tsa-daemon.openapi.json

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"license": {
77
"name": "Apache-2.0"
88
},
9-
"version": "2026.02.24-3bc3da88"
9+
"version": "2026.03.11-17b80530"
1010
},
1111
"paths": {
1212
"/api/admin/bump-config-epoch": {
@@ -62,6 +62,21 @@
6262
}
6363
}
6464
},
65+
"/api/admin/task-dump": {
66+
"get": {
67+
"tags": [
68+
"debugging"
69+
],
70+
"summary": "Returns a dump of the runtime task state.",
71+
"description": "{{since('dev')}}\n\nThe output is not machine parseable and may change without notice\nbetween versions of kumomta.\n\nCapturing the dump is very expensive and can take several seconds.\nCapturing a dump is not guaranteed to succeed.\n\nAt the time of writing, capturing a dump can cause a subsequent\ngraceful shutdown to get stuck, unless you repeatedly trigger\nthis API endpoint a few more times to \"clock through\" the shutdown\nprocess and enable it to complete successfully.",
72+
"operationId": "task_dump",
73+
"responses": {
74+
"200": {
75+
"description": "data was returned"
76+
}
77+
}
78+
}
79+
},
6580
"/api/machine-info": {
6681
"get": {
6782
"tags": [

0 commit comments

Comments
 (0)