|
| 1 | +from datetime import datetime, timedelta, timezone |
| 2 | + |
| 3 | +from celery import current_app, states |
| 4 | +from celery.utils.log import get_task_logger |
| 5 | +from config.django.base import ATTACK_PATHS_SCAN_STALE_THRESHOLD_MINUTES |
| 6 | +from tasks.jobs.attack_paths.db_utils import ( |
| 7 | + _mark_scan_finished, |
| 8 | + recover_graph_data_ready, |
| 9 | +) |
| 10 | + |
| 11 | +from api.attack_paths import database as graph_database |
| 12 | +from api.db_router import MainRouter |
| 13 | +from api.db_utils import rls_transaction |
| 14 | +from api.models import AttackPathsScan, StateChoices |
| 15 | + |
| 16 | +logger = get_task_logger(__name__) |
| 17 | + |
| 18 | + |
def cleanup_stale_attack_paths_scans() -> dict:
    """
    Mark stuck `EXECUTING` `AttackPathsScan` rows as `FAILED`.

    Detection strategy (two passes):
    1. When the backing `TaskResult` records a worker, ping that worker.
       - Unresponsive worker: clean the scan up regardless of its age.
       - Responsive worker, scan older than the stale threshold: revoke the
         Celery task first, then clean up.
       - Responsive worker, scan still within the threshold: leave it alone.
    2. Without a recorded worker, rely purely on the age heuristic.

    Returns:
        Summary dict with `cleaned_up_count` and the affected `scan_ids`.
    """
    stale_after = timedelta(minutes=ATTACK_PATHS_SCAN_STALE_THRESHOLD_MINUTES)
    deadline = datetime.now(tz=timezone.utc) - stale_after

    candidates = list(
        AttackPathsScan.all_objects.using(MainRouter.admin_db)
        .filter(state=StateChoices.EXECUTING)
        .select_related("task__task_runner_task")
    )

    # Memoize liveness so each distinct worker is pinged at most once.
    liveness: dict = {}
    for scan in candidates:
        runner = getattr(scan.task, "task_runner_task", None) if scan.task else None
        if runner and runner.worker and runner.worker not in liveness:
            liveness[runner.worker] = _is_worker_alive(runner.worker)

    cleaned_up: list = []

    for scan in candidates:
        runner = getattr(scan.task, "task_runner_task", None) if scan.task else None
        worker_name = runner.worker if runner else None

        if not worker_name:
            # No worker recorded — the age heuristic is all we have.
            if scan.started_at and scan.started_at >= deadline:
                continue
            reason = (
                "No worker recorded, scan exceeded stale threshold — "
                "cleaned up by periodic task"
            )
        elif liveness.get(worker_name, True):
            # Worker responds; only act once the scan is past the threshold.
            if scan.started_at and scan.started_at >= deadline:
                continue
            # Alive but stale — revoke the hung task before cleaning up.
            _revoke_task(runner)
            reason = "Scan exceeded stale threshold — cleaned up by periodic task"
        else:
            reason = "Worker dead — cleaned up by periodic task"

        if _cleanup_scan(scan, runner, reason):
            cleaned_up.append(str(scan.id))

    logger.info(
        f"Stale `AttackPathsScan` cleanup: {len(cleaned_up)} scan(s) cleaned up"
    )
    return {"cleaned_up_count": len(cleaned_up), "scan_ids": cleaned_up}
| 89 | + |
| 90 | + |
def _is_worker_alive(worker: str) -> bool:
    """
    Ping one Celery worker by name.

    Returns `True` when the worker answers the ping — and also `True` when
    the ping itself errors out, so a broker hiccup never causes healthy
    scans to be reaped.
    """
    try:
        replies = current_app.control.inspect(
            destination=[worker], timeout=1.0
        ).ping()
    except Exception:
        logger.exception(f"Failed to ping worker {worker}, treating as alive")
        return True
    return replies is not None and worker in replies
| 99 | + |
| 100 | + |
def _revoke_task(task_result) -> None:
    """
    Terminate a hung Celery task by revoking it with `SIGTERM`.

    Failures are logged and swallowed so the caller's cleanup can proceed.
    """
    try:
        current_app.control.revoke(
            task_result.task_id, terminate=True, signal="SIGTERM"
        )
    except Exception:
        logger.exception(f"Failed to revoke task {task_result.task_id}")
    else:
        logger.info(f"Revoked task {task_result.task_id}")
| 110 | + |
| 111 | + |
def _cleanup_scan(scan: AttackPathsScan, task_result, reason: str) -> bool:
    """
    Clean up a single stale `AttackPathsScan`.

    Steps, in order:
    1. Drop the scan's temporary Neo4j database (best-effort).
    2. Re-read the row under `select_for_update` inside an RLS transaction,
       verify it is still `EXECUTING`, and mark it `FAILED` with *reason*.
    3. Mark the associated Celery `TaskResult` as `FAILURE` — done outside
       the lock because that table is not RLS-protected.
    4. Recover `graph_data_ready` if provider data still exists.

    Args:
        scan: The stale scan row, as read outside any lock (may be out of
            date — the state is re-checked under the lock before acting).
        task_result: The scan's `TaskResult`, or `None` if not recorded.
        reason: Human-readable explanation stored as the scan's global error.

    Returns:
        `True` if the scan was actually cleaned up, `False` if skipped
        (row vanished, or it already left the `EXECUTING` state).
    """
    scan_id_str = str(scan.id)

    # 1. Drop the temp Neo4j database. Best-effort: a failure here must not
    # prevent the scan row from being failed, so we only log it.
    tmp_db_name = graph_database.get_database_name(scan.id, temporary=True)
    try:
        graph_database.drop_database(tmp_db_name)
    except Exception:
        logger.exception(f"Failed to drop temp database {tmp_db_name}")

    # 2. Lock the row, verify it is still EXECUTING, mark it FAILED — all
    # atomic within the tenant's RLS transaction so a concurrently finishing
    # worker cannot race this cleanup.
    with rls_transaction(str(scan.tenant_id)):
        try:
            fresh_scan = AttackPathsScan.objects.select_for_update().get(id=scan.id)
        except AttackPathsScan.DoesNotExist:
            logger.warning(f"Scan {scan_id_str} no longer exists, skipping")
            return False

        # The scan may have finished (or been cleaned up) since the
        # unlocked read in the caller — skip if so.
        if fresh_scan.state != StateChoices.EXECUTING:
            logger.info(f"Scan {scan_id_str} is now {fresh_scan.state}, skipping")
            return False

        _mark_scan_finished(fresh_scan, StateChoices.FAILED, {"global_error": reason})

    # 3. Mark the `TaskResult` as `FAILURE`. Its table is not RLS-protected,
    # so this happens outside the transaction/lock above.
    if task_result:
        task_result.status = states.FAILURE
        task_result.date_done = datetime.now(tz=timezone.utc)
        task_result.save(update_fields=["status", "date_done"])

    # 4. Recover `graph_data_ready` if the provider data still exists.
    recover_graph_data_ready(fresh_scan)

    logger.info(f"Cleaned up stale scan {scan_id_str}: {reason}")
    return True
0 commit comments