Fix: Prevent context-window bloat in ReWOO + summarization (#121)

rishikesh-jentic · web-flow · commit 3aba77dc60b3 · 2025-10-13T14:32:19.000+01:00
* Mitigate context bloat in ReWOO

* Mitigate context bloat in ReWOO: add unit tests

* Mitigate context bloat in ReWOO: bump version
diff --git a/agents/reasoner/rewoo.py b/agents/reasoner/rewoo.py
@@ -180,9 +180,9 @@ def _execute(self, step: Step, state: ReasonerState) -> None:
 
         if step.output_key:
             self.memory[step.output_key] = step.result
-            state.history.append(f"remembered {step.output_key} : {step.result}")
 
-        state.history.append(f"Executed step: {step.text} -> {step.result}")
+        # Truncate the stringified step result to 8124 characters (~8KB) to cap history growth and avoid context-window bloat
+        state.history.append(f"Executed step: {step.text} -> {str(step.result)[:8124]}")
         logger.info("step_executed", step_text=step.text, step_type=step_type, result=str(step.result)[:100] if step.result is not None else None)
 
     @observe
diff --git a/agents/standard_agent.py b/agents/standard_agent.py
@@ -101,7 +101,8 @@ def solve(self, goal: str) -> ReasoningResult:
 
         try:
             result = self.reasoner.run(goal)
-            result.final_answer = self.llm.prompt(_PROMPTS["summarize"].format(goal=goal, history=getattr(result, "transcript", "")))
+            # Keep only the last 12,000 characters (~12KB) of the transcript to limit context size and avoid context-window errors
+            result.final_answer = self.llm.prompt(_PROMPTS["summarize"].format(goal=goal, history=getattr(result, "transcript", "")[-12000:]))
 
             self._record_interaction({"goal": goal, "result": result.final_answer})
             self._state = AgentState.READY
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "standard-agent"
-version = "0.1.10"
+version = "0.1.11"
 description = "A simple, modular library for building AI agents—with a composable core and plug‑in components."
 requires-python = ">=3.11"
 readme = "README.md"
diff --git a/tests/agents/reasoners/test_rewoo.py b/tests/agents/reasoners/test_rewoo.py
@@ -35,9 +35,9 @@ def test_rewoo_plan_parses_valid_bullets_and_records_successful_tool_call():
 
     # Successful tool call recorded once
     assert result.tool_calls and result.tool_calls[0] == {"tool_id": "t1", "summary": "Tool One"}
-    # Transcript contains remembered k1 and executed steps
-    assert "remembered k1" in result.transcript
+    # Transcript contains executed steps; memory has k1
     assert "Executed step:" in result.transcript
+    assert "k1" in memory
 
 
 def test_rewoo_plan_raises_on_input_before_output():
@@ -287,6 +287,44 @@ def test_rewoo_selection_invalid_id_records_no_tool_call():
     assert result.tool_calls == []
 
 
+def test_rewoo_history_truncates_step_result_to_8kb():
+    # Plan with one TOOL step producing a very large payload
+    plan_text = "- fetch big (output: k1)"
+    large_payload = "X" * 50000  # 50,000 characters (~50KB)
+
+    class BigTool(DummyTool):
+        def __init__(self, tool_id: str, name: str):
+            super().__init__(tool_id, name, schema={})
+
+    class BigTools(DummyTools):
+        def execute(self, tool, params):  # type: ignore[override]
+            return large_payload
+
+    llm = DummyLLM(
+        text_queue=[
+            plan_text,  # plan
+            "TOOL",     # classify
+            "t1",       # select tool
+        ],
+        json_queue=[{}],  # params
+    )
+    tools = BigTools([BigTool("t1", "Big Tool")])
+    memory: Dict[str, Any] = DictMemory()
+
+    reasoner = ReWOOReasoner(llm=llm, tools=tools, memory=memory)
+    result = reasoner.run("goal")
+
+    # The "Executed step:" line should carry the payload truncated to at most 8124 chars
+    assert "Executed step:" in result.transcript
+    executed_lines = [ln for ln in result.transcript.split("\n") if ln.startswith("Executed step:")]
+    assert executed_lines, "Expected at least one executed step line"
+    line = executed_lines[-1]
+    # Loose bound: the line must be far shorter than the original 50,000-char payload
+    assert len(line) < 20000
+    # And memory stores full payload (no truncation in memory)
+    assert memory.get("k1") == large_payload
+
+
 def test_rewoo_param_gen_error_triggers_reflection_and_no_tool_call():
     # Override LLM to raise a ValueError during param generation
     class FailParamLLM(DummyLLM):
diff --git a/tests/agents/test_standard_agent.py b/tests/agents/test_standard_agent.py
@@ -29,6 +29,17 @@ def run(self, goal: str) -> ReasoningResult:  # type: ignore[override]
         return ReasoningResult(transcript="trace", success=True)
 
 
+class LongTranscriptReasoner(BaseReasoner):
+    def __init__(self):
+        # type: ignore[call-arg]
+        pass
+
+    def run(self, goal: str) -> ReasoningResult:  # type: ignore[override]
+        # Return a 50,000-character transcript, far larger than the 12,000 characters the summarize step keeps
+        long_trace = "Y" * 50000
+        return ReasoningResult(transcript=long_trace, success=True)
+
+
 class FailingReasoner(BaseReasoner):
     def __init__(self):
         # type: ignore[call-arg]
@@ -155,6 +166,32 @@ def run(self, goal: str) -> ReasoningResult:  # type: ignore[override]
     assert result.final_answer == "S"
 
 
+def test_agent_summarize_uses_only_last_12kb_of_transcript(monkeypatch):
+    _fixed_uuid4(monkeypatch, "RUN12K")
+
+    captured_prompt = {"text": None}
+
+    class CapturingLLM(DummyLLM):
+        def prompt(self, text: str) -> str:  # type: ignore[override]
+            captured_prompt["text"] = text
+            return "OK"
+
+    llm = CapturingLLM()
+    tools = DummyTools()
+    memory: Dict[str, Any] = DictMemory()
+    reasoner = LongTranscriptReasoner()
+
+    agent = StandardAgent(llm=llm, tools=tools, memory=memory, reasoner=reasoner)
+    agent.solve("g")
+
+    assert captured_prompt["text"] is not None
+    # Inspect the full summarize prompt captured from the LLM call (the history is not isolated here)
+    text = captured_prompt["text"] or ""
+    # The history is inserted via format(... history= ...), so a truncated history keeps the whole prompt small
+    assert len(text) < 30000  # entire prompt under 30k
+    assert "Y" * 20000 not in text  # definitely not the full 50k
+
+
 def test_agent_conversation_history_respects_window(monkeypatch):
     class SmallReasoner(BaseReasoner):
         def __init__(self):