Skip to content

Commit df48e29

Browse files
Add CUDA health checking to /healthz endpoint (#2204)
* Add CUDA health checking to /healthz endpoint The /healthz endpoint now verifies CUDA context health when running on GPU by calling torch.cuda.synchronize() (surfaces async errors) and torch.cuda.mem_get_info() (verifies runtime). Returns 503 when CUDA is corrupted. Failure state is cached permanently since CUDA context corruption is unrecoverable -- subsequent health checks return instantly without touching CUDA. On CPU-only servers, behaves exactly as before. * Remove CUDA error detail from /healthz response body Log the error server-side instead of exposing it in the HTTP response. Addresses CodeQL information-exposure-through-an-exception finding. --------- Co-authored-by: Paweł Pęczek <[email protected]>
1 parent dad37aa commit df48e29

3 files changed

Lines changed: 249 additions & 2 deletions

File tree

inference/core/interfaces/http/http_api.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2249,8 +2249,25 @@ def readiness(
22492249

22502250
@app.get("/healthz", status_code=200)
def healthz():
    """Health endpoint for Kubernetes liveness probe.

    Verifies CUDA context health when running on GPU. Returns 503 if
    CUDA is corrupted (unrecoverable - requires process restart).
    """
    from inference.core.utils.cuda_health import check_cuda_health

    is_healthy, error = check_cuda_health()
    if not is_healthy:
        # CUDA context corruption cannot be recovered in-process; signal the
        # orchestrator to restart us. Error detail is logged server-side only
        # and deliberately kept out of the HTTP body.
        logger.error("CUDA health check failed: %s", error)
        return JSONResponse(
            status_code=503,
            content={"status": "unhealthy", "reason": "cuda_error"},
        )
    return {"status": "healthy"}
22542271

22552272
if CORE_MODELS_ENABLED:
22562273
if CORE_MODEL_CLIP_ENABLED:
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""CUDA health checking utilities.
2+
3+
Provides a fast, cached health check for GPU/CUDA state. Once CUDA fails,
4+
the context is permanently corrupted and cannot recover without process restart.
5+
The failure state is cached to avoid repeatedly calling into a broken CUDA runtime.
6+
"""
7+
8+
import logging
9+
import threading
10+
import time
11+
from typing import Optional, Tuple
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
class CudaHealthChecker:
    """Thread-safe CUDA health checker with failure caching.

    Once a CUDA failure is detected, the result is cached permanently
    (CUDA context corruption is unrecoverable). Subsequent calls return
    the cached failure immediately without touching CUDA.
    """

    def __init__(self):
        # Serializes the actual CUDA probe so concurrent health checks never
        # issue overlapping CUDA calls.
        self._lock = threading.Lock()
        # Sticky failure flag: transitions False -> True exactly once and is
        # never cleared (a process restart is required to recover).
        self._cuda_failed: bool = False
        self._failure_error: Optional[str] = None
        self._failure_time: Optional[float] = None
        self._gpu_available: Optional[bool] = None  # None = not yet checked

    def _is_gpu_environment(self) -> bool:
        """Check if we're running in a GPU environment. Cached after first call."""
        if self._gpu_available is not None:
            return self._gpu_available
        try:
            import torch

            self._gpu_available = torch.cuda.is_available()
        except Exception:
            # A single handler covers both ImportError (torch not installed)
            # and any torch-internal failure while probing availability; in
            # either case we treat the environment as CPU-only.
            self._gpu_available = False
        return self._gpu_available

    def check_health(self) -> Tuple[bool, Optional[str]]:
        """Check CUDA health. Returns (is_healthy, error_message).

        - If not a GPU environment: returns (True, None) immediately
        - If CUDA previously failed: returns cached failure immediately
        - Otherwise: runs synchronize + mem_get_info check

        Thread-safe. The actual CUDA check is serialized by the lock to
        prevent concurrent CUDA calls during health checking.
        """
        # Fast path: not a GPU environment
        if not self._is_gpu_environment():
            return True, None

        # Fast path: already known to be failed (unrecoverable). Reading the
        # flag without the lock is safe because it only ever goes False->True.
        if self._cuda_failed:
            return False, self._failure_error

        # Slow path: actually check CUDA
        with self._lock:
            # Double-check after acquiring lock
            if self._cuda_failed:
                return False, self._failure_error

            try:
                import torch

                # Synchronize to surface any pending async CUDA errors
                torch.cuda.synchronize()
                # Query runtime to verify it's still functional
                torch.cuda.mem_get_info()
                return True, None
            except Exception as e:
                error_msg = f"CUDA health check failed: {e}"
                # Lazy %-formatting (logging best practice) instead of
                # pre-rendering the message with an f-string.
                logger.error("CUDA health check failed: %s", e)
                self._cuda_failed = True
                self._failure_error = error_msg
                self._failure_time = time.time()
                return False, error_msg

    @property
    def is_failed(self) -> bool:
        """True once a CUDA failure has been recorded (sticky)."""
        return self._cuda_failed

    @property
    def failure_info(self) -> Optional[dict]:
        """Error message and wall-clock timestamp of the failure, or None."""
        if not self._cuda_failed:
            return None
        return {
            "error": self._failure_error,
            "failed_at": self._failure_time,
        }
97+
98+
99+
# Process-wide singleton shared by every caller of the helpers below.
_cuda_checker = CudaHealthChecker()


def check_cuda_health() -> Tuple[bool, Optional[str]]:
    """Run the singleton checker and return (is_healthy, error_message)."""
    return _cuda_checker.check_health()


def get_cuda_health_checker() -> CudaHealthChecker:
    """Expose the singleton instance for dependency injection / testing."""
    return _cuda_checker
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
from unittest.mock import MagicMock, patch
2+
3+
import pytest
4+
5+
from inference.core.utils.cuda_health import CudaHealthChecker
6+
7+
8+
class TestCudaHealthChecker:
    """Unit tests for CudaHealthChecker.

    All CUDA interaction is simulated by injecting a MagicMock in place of
    the ``torch`` module via ``patch.dict("sys.modules", ...)``, so these
    tests run on machines without a GPU or without torch installed.
    """

    def setup_method(self):
        """Create a fresh checker for each test."""
        self.checker = CudaHealthChecker()

    def test_cpu_environment_no_torch(self):
        """When torch is not installed, should always return healthy."""
        # Mapping "torch" to None makes `import torch` raise ImportError.
        with patch.dict("sys.modules", {"torch": None}):
            self.checker._gpu_available = None  # reset cache
            is_healthy, error = self.checker.check_health()
            assert is_healthy is True
            assert error is None

    def test_cpu_environment_no_cuda(self):
        """When torch is available but CUDA is not, should return healthy."""
        mock_torch = MagicMock()
        mock_torch.cuda.is_available.return_value = False
        with patch.dict("sys.modules", {"torch": mock_torch}):
            self.checker._gpu_available = None
            is_healthy, error = self.checker.check_health()
            assert is_healthy is True
            assert error is None

    def test_healthy_gpu(self):
        """When CUDA operations succeed, should return healthy."""
        mock_torch = MagicMock()
        mock_torch.cuda.is_available.return_value = True
        mock_torch.cuda.synchronize.return_value = None
        # (free_bytes, total_bytes), matching the real mem_get_info signature.
        mock_torch.cuda.mem_get_info.return_value = (4_000_000_000, 8_000_000_000)

        # Pre-seed the cache so check_health() takes the CUDA-probing path.
        self.checker._gpu_available = True
        with patch.dict("sys.modules", {"torch": mock_torch}):
            is_healthy, error = self.checker.check_health()
            assert is_healthy is True
            assert error is None
            mock_torch.cuda.synchronize.assert_called_once()
            mock_torch.cuda.mem_get_info.assert_called_once()

    def test_cuda_synchronize_failure(self):
        """When torch.cuda.synchronize() fails, should detect CUDA corruption."""
        mock_torch = MagicMock()
        mock_torch.cuda.is_available.return_value = True
        mock_torch.cuda.synchronize.side_effect = RuntimeError(
            "CUDA error: an illegal memory access was encountered"
        )

        self.checker._gpu_available = True
        with patch.dict("sys.modules", {"torch": mock_torch}):
            is_healthy, error = self.checker.check_health()
            assert is_healthy is False
            # The original exception text must be propagated in the message.
            assert "illegal memory access" in error
            assert self.checker.is_failed is True

    def test_mem_get_info_failure(self):
        """When mem_get_info fails (after synchronize succeeds), should detect failure."""
        mock_torch = MagicMock()
        mock_torch.cuda.is_available.return_value = True
        mock_torch.cuda.synchronize.return_value = None
        mock_torch.cuda.mem_get_info.side_effect = RuntimeError("CUDA runtime error")

        self.checker._gpu_available = True
        with patch.dict("sys.modules", {"torch": mock_torch}):
            is_healthy, error = self.checker.check_health()
            assert is_healthy is False
            assert "CUDA runtime error" in error

    def test_failure_is_cached(self):
        """After first CUDA failure, subsequent checks should return cached failure
        without calling torch again."""
        mock_torch = MagicMock()
        mock_torch.cuda.is_available.return_value = True
        mock_torch.cuda.synchronize.side_effect = RuntimeError("CUDA error")

        self.checker._gpu_available = True
        with patch.dict("sys.modules", {"torch": mock_torch}):
            # First call: detects failure
            is_healthy1, error1 = self.checker.check_health()
            assert is_healthy1 is False
            assert mock_torch.cuda.synchronize.call_count == 1

            # Second call: returns cached failure, no new CUDA calls
            mock_torch.cuda.synchronize.reset_mock()
            is_healthy2, error2 = self.checker.check_health()
            assert is_healthy2 is False
            assert error2 == error1
            mock_torch.cuda.synchronize.assert_not_called()

    def test_failure_info(self):
        """failure_info should return error details after failure."""
        # Healthy checker exposes no failure info.
        assert self.checker.failure_info is None

        mock_torch = MagicMock()
        mock_torch.cuda.is_available.return_value = True
        mock_torch.cuda.synchronize.side_effect = RuntimeError("CUDA error")

        self.checker._gpu_available = True
        with patch.dict("sys.modules", {"torch": mock_torch}):
            self.checker.check_health()

        info = self.checker.failure_info
        assert info is not None
        assert "CUDA error" in info["error"]
        assert info["failed_at"] is not None

    def test_gpu_available_is_cached(self):
        """_is_gpu_environment() should only check torch once."""
        mock_torch = MagicMock()
        mock_torch.cuda.is_available.return_value = False
        with patch.dict("sys.modules", {"torch": mock_torch}):
            assert self.checker._is_gpu_environment() is False
            assert self.checker._is_gpu_environment() is False
            # Only called once despite two invocations
            mock_torch.cuda.is_available.assert_called_once()

0 commit comments

Comments
 (0)