Skip to content

Commit 0698f77

Browse files
committed
Fix health check false positives, ClientDisconnect spam, and Stripe API burst
- Increase HEALTH_CHECK_TIMEOUT from 10s to 30s and MAX_FAILURES from 3 to 5 to prevent killing healthy servers during load spikes.
- Handle ClientDisconnect in DiscordErrorMiddleware: skip Discord error reporting for normal client disconnections (it was firing every minute).
- Add set_if_not_exists to SharedCache for atomic lock operations.
- Use a SETNX lock in the background Stripe check to prevent a race condition where multiple workers all start Stripe checks simultaneously (reduced Stripe API calls from 234 to 78 per cache-miss cycle).
1 parent ed0a10e commit 0698f77

3 files changed

Lines changed: 50 additions & 0 deletions

File tree

agixt/MagicalAuth.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5667,6 +5667,13 @@ def _background_stripe_subscription_check(
56675667
logging.debug(f"Stripe check cache hit for {user_email}")
56685668
return
56695669

5670+
# Use a lock key to prevent multiple workers from checking simultaneously
5671+
lock_key = f"stripe_lock:{user_id}"
5672+
if not shared_cache.set_if_not_exists(lock_key, "1", ttl=30):
5673+
# Another worker is already checking, skip
5674+
logging.debug(f"Stripe check already in progress for {user_email}")
5675+
return
5676+
56705677
stripe.api_key = api_key
56715678

56725679
session = get_session()

agixt/SharedCache.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,40 @@ def set(self, key: str, value: Any, ttl: int = 0) -> bool:
201201
}
202202
return True
203203

204+
def set_if_not_exists(self, key: str, value: Any, ttl: int = 0) -> bool:
    """
    Set a value only if the key does not already exist.

    When a Redis client is configured the write is a single
    ``SET key value NX [EX ttl]`` command. If there is no Redis client, or
    the Redis call raises, the in-memory local cache is used instead, with
    the check-and-set performed while holding ``_local_cache_lock``.

    Args:
        key: Cache key; it is passed through ``_make_key`` before use.
        value: Value to store. It must be JSON-serializable, even when only
            the local cache ends up being used.
        ttl: Expiry in seconds. Values <= 0 mean the entry never expires.

    Returns:
        True if the value was stored by this call. False if a live entry
        already exists for the key, or if the value could not be
        JSON-serialized (that case is logged at debug level so it can be
        told apart from "key already existed").
    """
    full_key = self._make_key(key)

    try:
        serialized = json.dumps(value)
    except (TypeError, ValueError) as e:
        # Callers treat False as "someone else holds the key", so leave a
        # trace instead of failing silently.
        logger.debug(
            f"SharedCache setnx skipped, value for {key} is not JSON-serializable: {e}"
        )
        return False

    if self._redis is not None:
        # Only pass `ex` when a positive TTL was requested, so the call is
        # identical to a plain `SET ... NX` otherwise.
        redis_kwargs = {"nx": True}
        if ttl > 0:
            redis_kwargs["ex"] = ttl
        try:
            return bool(self._redis.set(full_key, serialized, **redis_kwargs))
        except Exception as e:
            # Best-effort: fall through to the local cache below.
            # NOTE(review): the local cache is an in-memory dict on this
            # instance, so after a Redis failure the "lock" is presumably
            # only visible within this process - confirm this is acceptable
            # for cross-worker callers.
            logger.debug(f"SharedCache Redis setnx error: {e}")

    # Local cache fallback
    with self._local_cache_lock:
        entry = self._local_cache.get(full_key)
        if entry is not None:
            expires_at = entry["expires_at"]
            # A live (non-expired) entry blocks the write; an expired one
            # is simply overwritten below.
            if expires_at is None or time.time() <= expires_at:
                return False
        self._local_cache[full_key] = {
            "value": value,
            "expires_at": time.time() + ttl if ttl > 0 else None,
        }
        return True
237+
204238
def delete(self, key: str) -> bool:
205239
"""
206240
Delete a key from the cache.

agixt/middleware.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,15 @@ async def dispatch(self, request: Request, call_next):
570570
response = await call_next(request)
571571
return response
572572
except Exception as e:
573+
# Ignore ClientDisconnect - these are normal when clients close connections
574+
from starlette.requests import ClientDisconnect
575+
576+
if isinstance(e, ClientDisconnect) or "ClientDisconnect" in str(type(e)):
577+
self.logger.debug(
578+
f"Client disconnected during {request.method} {request.url.path}"
579+
)
580+
raise
581+
573582
# Log the error locally
574583
self.logger.error(
575584
f"Unhandled exception in {request.method} {request.url.path}: {e}"

0 commit comments

Comments
 (0)