falcosecurity · irozzo-1A · Mar 4, 2026 · Mar 6, 2026 · Mar 11, 2026 · Apr 10, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -31,6 +31,10 @@ option(USE_UBSAN "Build with UndefinedBehaviorSanitizer" OFF)
 option(UBSAN_HALT_ON_ERROR "Halt on error when building with UBSan" ON)
 option(USE_GPERFTOOLS "Build with gperftools CPU profiler support" OFF)
 option(USE_FRAME_POINTER "Build with frame pointers for accurate profiling" OFF)
+option(FALCO_MULTI_THREAD "Enable multi-threaded event processing (requires modern eBPF)" OFF)
+if(FALCO_MULTI_THREAD)
+	add_definitions(-DFALCO_MULTI_THREAD)
+endif()
 
 # Enable frame pointers by default when using gperftools for accurate stack traces
 if(USE_GPERFTOOLS AND NOT USE_FRAME_POINTER)

diff --git a/cmake/modules/CompilerFlags.cmake b/cmake/modules/CompilerFlags.cmake
@@ -68,6 +68,11 @@ if(NOT MSVC)
 		endif()
 	endif()
 
+	if(USE_TSAN)
+		set(FALCO_SECURITY_FLAGS "${FALCO_SECURITY_FLAGS} -fsanitize=thread")
+		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread")
+	endif()
+
 	if(USE_FRAME_POINTER)
 		set(FALCO_SECURITY_FLAGS "${FALCO_SECURITY_FLAGS} -fno-omit-frame-pointer")
 	endif()

diff --git a/cmake/modules/falcosecurity-libs.cmake b/cmake/modules/falcosecurity-libs.cmake
@@ -157,6 +157,20 @@ else()
 	message(STATUS "No strlcpy and strlcat found, will use local definition")
 endif()
 
+# Keep libs in sync with Falco's multi-thread option. Use FORCE so toggling FALCO_MULTI_THREAD
+# updates a previously cached ENABLE_MULTI_THREAD value.
+if(FALCO_MULTI_THREAD)
+	set(ENABLE_MULTI_THREAD
+		ON
+		CACHE BOOL "Multi-threaded libsinsp (from FALCO_MULTI_THREAD)" FORCE
+	)
+else()
+	set(ENABLE_MULTI_THREAD
+		OFF
+		CACHE BOOL "Multi-threaded libsinsp (from FALCO_MULTI_THREAD)" FORCE
+	)
+endif()
+
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
 	include(driver)
 endif()

diff --git a/falco.yaml b/falco.yaml
@@ -438,6 +438,12 @@ engine:
     cpus_for_each_buffer: 2
     buf_size_preset: 4
     drop_failed_exit: false
+    # -- [EXPERIMENTAL] Parallel event processing is optional. When Falco and
+    # falcosecurity/libs are built with multi-thread support, you may add:
+    #   num_worker_threads: <N>
+    # with N > 1 for TGID-partitioned ring buffers (one worker per buffer); omit
+    # this key or use 0 for single-threaded mode. If libs lack multi-thread support,
+    # do not set this key (schema rejects it in those builds).
   # -- Engine-specific configuration for replay engine which replays a capture file.
   replay:
     # -- Path to the capture file to replay (eg: /path/to/file.scap)

diff --git a/proposals/20251205-multi-thread-falco-design-review.md b/proposals/20251205-multi-thread-falco-design-review.md
@@ -0,0 +1,257 @@
+# Review of Multi-Threaded Falco Design Proposal
+
+## Overall Assessment
+
+**Status**: Strong foundation with some areas needing clarification and expansion
+
+The proposal provides a solid high-level design for multi-threading in Falco. The analysis of partitioning strategies is thorough, and the choice of TGID partitioning is well-justified. However, several areas need more detail or clarification before implementation.
+
+---
+
+## Strengths
+
+1. **Clear Problem Statement**: The goal of addressing single CPU core saturation is well-defined
+2. **Comprehensive Trade-off Analysis**: Good comparison of TGID, TID, and pipelining approaches
+3. **Honest Risk Assessment**: Acknowledges complexity and potential pitfalls
+4. **Structured Approach**: Well-organized sections that build logically
+
+---
+
+## Critical Issues & Recommendations
+
+### 1. TGID Routing Implementation Details
+
+**Issue**: The proposal states routing happens "at the kernel driver level" but doesn't specify:
+- Whether routing occurs in eBPF code or userspace
+- How the hash function is implemented
+- Whether `num_workers` is fixed at initialization or dynamic
+- How ring buffers are allocated/managed per partition
+
+**Recommendation**: Add a section clarifying:
+- Routing mechanism (eBPF vs userspace)
+- Ring buffer allocation strategy
+- Hash function selection (e.g., Jenkins hash, simple modulo)
+- Dynamic worker scaling considerations
+
+### 2. Shared State Synchronization Design
+
+**Issue**: Line 32 mentions "lightweight synchronization mechanisms" but defers to a future document. This is critical for correctness.
+
+**Recommendation**: At minimum, specify:
+- Which data structures need synchronization (thread table, file descriptor tables, etc.)
+- Locking strategy (fine-grained locks, lock-free structures, RCU)
+- Performance targets (e.g., "synchronization overhead < 5% of event processing time")
+- Deadlock prevention strategy
+
+### 3. Load Imbalance Mitigation
+
+**Issue**: The "hot process" problem is acknowledged but mitigation strategies are limited.
+
+**Recommendation**: Consider adding:
+- **Work Stealing**: Allow idle workers to steal from busy partitions
+- **Dynamic Rebalancing**: Periodically reassign TGIDs to different partitions
+- **Metrics**: Define how to measure and detect load imbalance
+- **Thresholds**: Define when load imbalance becomes problematic
+
+### 4. Temporal Consistency Guarantees
+
+**Issue**: The synchronization point discussion (lines 88-90) is good but incomplete.
+
+**Recommendation**: Expand to cover:
+- **Maximum Wait Time**: What happens if parent event never arrives?
+- **Timeout Strategy**: How long to wait before falling back to last-resort fetching?
+- **Event Ordering Guarantees**: Document what ordering guarantees are provided vs. what's required
+- **Race Condition Examples**: Provide concrete examples of problematic scenarios
+
+### 5. Output Handling Thread Safety
+
+**Issue**: Line 5 mentions "output handling" but the proposal doesn't discuss how outputs (gRPC, files, stdout) are handled in a multi-threaded context.
+
+**Recommendation**: Add section covering:
+- Output queue design (per-thread vs. shared)
+- Output ordering guarantees (if any)
+- Thread-safe output mechanisms
+- Rate limiting in multi-threaded context
+
+### 6. Plugin Thread Safety
+
+**Issue**: Line 33 mentions plugins aren't thread-safe and defers to a future document, but this affects the entire design.
+
+**Recommendation**: At minimum, specify:
+- Whether plugins will be per-thread instances or shared with locks
+- Migration strategy for existing plugins
+- Timeline for plugin thread-safety requirements
+- Backward compatibility considerations
+
+### 7. Error Handling and Recovery
+
+**Issue**: No discussion of error handling when:
+- A worker thread crashes
+- Ring buffer overflows
+- Synchronization deadlocks occur
+- Parent thread information is permanently unavailable
+
+**Recommendation**: Add section on:
+- Worker thread failure recovery
+- Ring buffer overflow handling
+- Deadlock detection/prevention
+- Graceful degradation strategies
+
+### 8. Performance Metrics and Success Criteria
+
+**Issue**: No clear definition of success metrics.
+
+**Recommendation**: Define:
+- Target throughput improvement (e.g., "2x with 4 threads")
+- Acceptable synchronization overhead
+- Maximum acceptable event reordering window
+- Drop rate reduction targets
+
+### 9. Migration Path
+
+**Issue**: No discussion of how to transition from single-threaded to multi-threaded.
+
+**Recommendation**: Add section on:
+- Feature flag/configuration option
+- Backward compatibility
+- Rollback strategy
+- Testing approach
+
+### 10. Testing Strategy
+
+**Issue**: No mention of how to test multi-threaded correctness.
+
+**Recommendation**: Add section on:
+- Concurrency testing approaches
+- Race condition detection
+- Load imbalance testing
+- Temporal consistency validation
+
+---
+
+## Technical Concerns
+
+### 1. Ring Buffer Design
+
+**Question**: `BPF_MAP_TYPE_RINGBUF` doesn't have per-CPU variants. How will per-TGID ring buffers be implemented?
+- Multiple ring buffer maps?
+- Single ring buffer with routing in userspace?
+- Custom eBPF map type?
+
+**Recommendation**: Clarify the ring buffer architecture.
+
+### 2. Hash Function Quality
+
+**Issue**: Simple modulo on TGID may cause poor distribution if TGIDs are sequential or clustered.
+
+**Recommendation**: Specify a proper hash function (e.g., Jenkins hash, xxHash) to ensure good distribution.
+
+### 3. vfork() Handling
+
+**Issue**: Line 90 mentions vfork() as a special case but doesn't provide a solution.
+
+**Recommendation**: Either:
+- Specify the alternative synchronization point
+- Document that vfork() will use last-resort fetching
+- Consider adding clone enter parent event back
+
+### 4. Signal Handling
+
+**Issue**: Current codebase shows signal handling (SIGUSR1, SIGINT, SIGHUP) in the event loop. How will this work with multiple threads?
+
+**Recommendation**: Document signal handling strategy for multi-threaded context.
+
+---
+
+## Minor Issues
+
+1. **Line 19**: Image reference - ensure `images/falco-architecture.png` exists
+2. **Line 28**: Image reference - ensure `images/falco-multi-thread-architecture.png` exists
+3. **Line 30**: Typo: "writes event into" should be "writes events into"
+4. **Line 49**: "ring-buffer" vs "ring buffer" - be consistent with terminology
+5. **Line 63**: Consider adding a concrete example of a "hot process" scenario
+6. **Line 88**: "clone exit parent event" - consider adding a brief explanation of what this event represents
+
+---
+
+## Missing Sections
+
+1. **Architecture Diagram Details**: The proposed architecture diagram should show:
+   - Worker thread pool
+   - Shared state with synchronization points
+   - Output handling
+   - Event flow
+
+2. **Configuration**: How will users configure:
+   - Number of worker threads
+   - Synchronization strategy (blocking vs. deferring)
+   - Load balancing parameters
+
+3. **Monitoring**: What metrics will be exposed:
+   - Per-thread event rates
+   - Load imbalance metrics
+   - Synchronization wait times
+   - Deferred event queue sizes
+
+4. **Limitations**: Document known limitations:
+   - Maximum number of supported threads
+   - Performance degradation scenarios
+   - Unsupported use cases
+
+---
+
+## Suggestions for Improvement
+
+### 1. Add Implementation Phases
+
+Consider breaking the implementation into phases:
+- **Phase 1**: Basic TGID partitioning with blocking synchronization
+- **Phase 2**: Add deferring mechanism
+- **Phase 3**: Add signaling-based synchronization
+- **Phase 4**: Optimize with work stealing
+
+### 2. Add Performance Modeling
+
+Include expected performance characteristics:
+- Linear scaling up to N threads
+- Synchronization overhead estimates
+- Memory overhead per thread
+
+### 3. Add Comparison with Alternatives
+
+Consider briefly comparing with:
+- User-space event batching
+- Kernel-level improvements
+- Hardware offloading
+
+### 4. Add References
+
+Include references to:
+- Related work on multi-threaded event processing
+- BPF ring buffer documentation
+- Lock-free data structure papers (if applicable)
+
+---
+
+## Questions for Authors
+
+1. What is the expected timeline for the shared state synchronization design document?
+2. How will this interact with existing multi-source support (line 544-598 in process_events.cpp shows multi-source threading)?
+3. Will this be opt-in initially or default behavior?
+4. What is the minimum kernel version requirement for BPF_MAP_TYPE_RINGBUF?
+5. How will this affect capture mode (offline trace file processing)?
+
+---
+
+## Conclusion
+
+This is a solid high-level design that addresses a real performance problem. The TGID partitioning approach is well-reasoned, but the proposal needs more detail on:
+
+1. **Implementation specifics** (routing, ring buffers, synchronization)
+2. **Error handling and recovery**
+3. **Performance targets and metrics**
+4. **Migration and testing strategies**
+
+I recommend expanding the document with the sections above before proceeding to detailed design documents. The foundation is strong, but these details are critical for successful implementation.
+
+**Recommendation**: ✅ **Approve with revisions** - Address critical issues before implementation begins.