11import { existsSync , mkdirSync , readFileSync , statSync , writeFileSync } from "node:fs" ;
2+ import { spawnSync } from "node:child_process" ;
23import { homedir } from "node:os" ;
34import { join } from "node:path" ;
45import { sanitizeIdentifier } from "../schema/sanitizer.js" ;
@@ -7,12 +8,15 @@ import { createFlairClient, defaultFlairKeyPath, type FlairAgent } from "../util
/** Interval default, in seconds. */
const DEFAULT_INTERVAL_SECONDS = 60;

/** Staleness threshold: five minutes, expressed in milliseconds. */
const STALE_MS = 5 * 60 * 1000;

/** Per-user `.tps` directory; `$HOME` takes precedence over `os.homedir()` when it is set and non-empty. */
const TPS_ROOT_DIR = join(process.env.HOME || homedir(), ".tps");

/** `<.tps>/cursors` */
const CURSOR_DIR = join(TPS_ROOT_DIR, "cursors");

/** `<.tps>/pulse/state.json` */
const PULSE_STATE_PATH = join(TPS_ROOT_DIR, "pulse", "state.json");

/** `<.tps>/office-health` */
const STATE_DIR = join(TPS_ROOT_DIR, "office-health");

/** `<.tps>/office-health/state.json` */
const STATE_PATH = join(STATE_DIR, "state.json");

/** Agent ids whose local processes are inspected by the local health check. */
const LOCAL_AGENT_IDS = ["ember", "sherlock", "kern", "pixel"] as const;
1215
1316export interface OfficeHealthArgs {
1417 interval ?: number ;
1518 json ?: boolean ;
19+ local ?: boolean ;
1620 viewerId ?: string ;
1721 flairUrl ?: string ;
1822 keyPath ?: string ;
@@ -39,13 +43,28 @@ export interface AgentHealthRecord {
3943 eventPublished : boolean ;
4044}
4145
/** Local process status for a single agent id. */
export interface LocalHealthRecord {
  /** Agent id the process lookup was made for. */
  agentId: string;
  /** Number of matching processes found; equals `pids.length`. */
  processCount: number;
  /** PIDs of the matching processes. */
  pids: number[];
  /** True only when exactly one matching process was found. */
  healthy: boolean;
}
52+
/** Snapshot of local process state produced by `checkLocalHealth`. */
export interface LocalHealthResult {
  /** Number of processes whose command line matches "tps mail send". */
  stuckMailProcesses: number;
  /** One record per locally checked agent id. */
  agents: LocalHealthRecord[];
  /** True when at least one process matches "pulse start". */
  pulseRunning: boolean;
  /** `lastPollAt` read from the pulse state file, or null when it is missing or unreadable. */
  pulseLastPoll: string | null;
}
59+
/** Outcome of a single office-health tick. */
export interface OfficeHealthTickResult {
  /** Tick time as a string (NOTE(review): format is not visible here — presumably ISO-8601; confirm). */
  timestamp: string;
  /** Id of the viewer the tick ran as. */
  viewerId: string;
  /** Number of agents examined in this tick. */
  checkedAgents: number;
  /** Number of examined agents whose record is marked stale. */
  staleAgents: number;
  /** Number of events published during this tick. */
  publishedEvents: number;
  /** Per-agent records, ordered by `agentId`. */
  agents: AgentHealthRecord[];
  /** Local process snapshot; set only when local checks were requested. */
  local?: LocalHealthResult;
}
5069
5170function fail ( message : string ) : never {
@@ -146,22 +165,74 @@ function buildIssues(agent: FlairAgent, nowMs: number): Omit<AgentHealthRecord,
146165 } ;
147166}
148167
/**
 * Lists the PIDs of processes whose full command line matches `pattern`.
 *
 * Runs `pgrep -f <pattern>` synchronously. Any non-zero (or missing) exit status and empty
 * output both yield an empty list, so callers never see an error from this lookup.
 *
 * @param pattern Pattern handed to `pgrep -f` unchanged.
 * @returns Positive integer PIDs in the order pgrep printed them.
 */
function readPids(pattern: string): number[] {
  const { status, stdout } = spawnSync("pgrep", ["-f", pattern], { encoding: "utf-8" });

  if (status !== 0) return [];
  if (stdout.trim() === "") return [];

  const found: number[] = [];
  for (const row of stdout.split("\n")) {
    const candidate = Number(row.trim());
    // Blank rows convert to 0 and junk converts to NaN; neither is a usable PID.
    if (Number.isInteger(candidate) && candidate > 0) {
      found.push(candidate);
    }
  }
  return found;
}
176+
/**
 * Builds the `pgrep -f` pattern that identifies one agent's start command.
 *
 * pgrep treats the pattern as a regular expression matched anywhere in the command line, so
 * the id is escaped and must be followed by a space or the end of the command line. Without
 * that anchor an id such as "ember" would also count processes started with "ember-2".
 */
function agentStartPattern(agentId: string): string {
  const escapedId = agentId.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  return `agent start --id ${escapedId}( |$)`;
}

/**
 * Reads `lastPollAt` from the pulse state file.
 *
 * @returns The stored value when it is a string; null when the file is missing, unreadable,
 *   not valid JSON, or does not hold a string `lastPollAt`.
 */
function readPulseLastPoll(): string | null {
  try {
    // A missing file makes readFileSync throw, which lands in the catch below; no separate
    // existence check is needed.
    const parsed: unknown = JSON.parse(readFileSync(PULSE_STATE_PATH, "utf-8"));
    if (typeof parsed !== "object" || parsed === null) return null;
    const lastPollAt = (parsed as { lastPollAt?: unknown }).lastPollAt;
    // Only pass through real strings so the declared `string | null` type holds at runtime.
    return typeof lastPollAt === "string" ? lastPollAt : null;
  } catch {
    // Best effort: the state file is optional diagnostic input.
    return null;
  }
}

/**
 * Inspects local processes for every id in `LOCAL_AGENT_IDS` plus the pulse and mail commands.
 *
 * An agent is reported healthy only when exactly one process matches its start command;
 * zero or several matches are both unhealthy.
 *
 * @returns Per-agent process records, the number of "tps mail send" processes, whether any
 *   "pulse start" process exists, and the last poll time recorded in the pulse state file.
 */
export function checkLocalHealth(): LocalHealthResult {
  const agents = LOCAL_AGENT_IDS.map((agentId) => {
    const pids = readPids(agentStartPattern(agentId));
    return {
      agentId,
      processCount: pids.length,
      pids,
      healthy: pids.length === 1,
    };
  });

  const pulseLastPoll = readPulseLastPoll();

  return {
    stuckMailProcesses: readPids("tps mail send").length,
    agents,
    pulseRunning: readPids("pulse start").length > 0,
    pulseLastPoll,
  };
}
205+
/**
 * Renders one tick result as plain text; the report lines are joined with "\n".
 *
 * Line 1: the timestamp followed by checked / healthy / stale / published counters, where
 *         healthy is `checkedAgents - staleAgents`.
 * Line 2: "stale: " followed by each stale agent with its issue summaries (summaries joined
 *         by ", ", agents joined by "; "), or "stale: none" when no agent is stale.
 * When `result.local` is set, two more lines follow: per-agent process counts (with the pid
 * list added for agents that are not healthy) and a pulse line carrying running, lastPoll
 * ("missing" when null) and stuckMail.
 *
 * NOTE(review): this span is a diff rendering. Lines marked `-` show the previous direct
 * `return [ ... ].join("\n")` form and lines marked `+` show its replacement, which collects
 * the lines in an array so the optional local section can be appended.
 *
 * @param result Tick result to format.
 * @returns The multi-line text report.
 */
149206function renderText ( result : OfficeHealthTickResult ) : string {
150207 const healthy = result . checkedAgents - result . staleAgents ;
151208 const staleList = result . agents
152209 . filter ( ( agent ) => agent . stale )
153210 . map ( ( agent ) => `${ agent . agentId } [${ agent . issues . map ( ( issue ) => issue . summary ) . join ( ", " ) } ]` )
154211 . join ( "; " ) ;
155- return [
212+ const lines = [
156213 `[${ result . timestamp } ] checked=${ result . checkedAgents } healthy=${ healthy } stale=${ result . staleAgents } published=${ result . publishedEvents } ` ,
157214 staleList ? `stale: ${ staleList } ` : "stale: none" ,
158- ] . join ( "\n" ) ;
215+ ] ;
216+
// The local section is appended only when the tick result carries a `local` snapshot.
217+ if ( result . local ) {
218+ lines . push (
219+ `local: ${ result . local . agents
220+ . map ( ( agent ) => `${ agent . agentId } =${ agent . processCount } ${ agent . healthy ? "" : ` pids=[${ agent . pids . join ( "," ) } ]` } ` )
221+ . join ( " " ) } `
222+ ) ;
223+ lines . push (
224+ `local pulse: running=${ result . local . pulseRunning } lastPoll=${ result . local . pulseLastPoll ?? "missing" } stuckMail=${ result . local . stuckMailProcesses } `
225+ ) ;
226+ }
227+
228+ return lines . join ( "\n" ) ;
159229}
160230
161231export async function runOfficeHealthTick ( args : {
162232 viewerId : string ;
163233 flairUrl ?: string ;
164234 keyPath ?: string ;
235+ local ?: boolean ;
165236 nowMs ?: number ;
166237 state ?: HealthState ;
167238} ) : Promise < { result : OfficeHealthTickResult ; state : HealthState } > {
@@ -181,9 +252,7 @@ export async function runOfficeHealthTick(args: {
181252
182253 if ( record . stale ) {
183254 const summary = `${ agent . id } unhealthy: ${ record . issues . map ( ( issue ) => issue . summary ) . join ( ", " ) } ` ;
184- const detail = record . issues
185- . map ( ( issue ) => `${ issue . summary } ; ${ issue . detail } ` )
186- . join ( "\n" ) ;
255+ const detail = record . issues . map ( ( issue ) => `${ issue . summary } ; ${ issue . detail } ` ) . join ( "\n" ) ;
187256
188257 if ( ! prior ?. active ) {
189258 await flair . publishEvent ( {
@@ -215,6 +284,7 @@ export async function runOfficeHealthTick(args: {
215284 staleAgents : records . filter ( ( record ) => record . stale ) . length ,
216285 publishedEvents,
217286 agents : records . sort ( ( a , b ) => a . agentId . localeCompare ( b . agentId ) ) ,
287+ local : args . local ? checkLocalHealth ( ) : undefined ,
218288 } ;
219289
220290 return { result, state : nextState } ;
@@ -244,6 +314,7 @@ export async function runOfficeHealth(args: OfficeHealthArgs): Promise<void> {
244314 viewerId,
245315 flairUrl : args . flairUrl ?? process . env . FLAIR_URL ?? "http://127.0.0.1:9926" ,
246316 keyPath : args . keyPath ?? defaultFlairKeyPath ( viewerId ) ,
317+ local : args . local ,
247318 state,
248319 } ) ;
249320 state = tick . state ;
0 commit comments