fix(long-task): add debug logging for step-level observability

2026-05-20 16:42:25 +00:00 · 2026-04-27 00:39:45 +08:00 · 2026-04-27 00:39:45 +08:00 · e7214d96ed
commit e7214d96ed
parent bf5762a3d4
2 changed files with 70 additions and 16 deletions
--- a/nanobot/agent/tools/long_task.py
+++ b/nanobot/agent/tools/long_task.py
@ -40,8 +40,9 @@ class HandoffTool(Tool):
    @property
    def description(self) -> str:
        return (
-            "You are done with this step. Pass control to the next step. "
-            "You MUST call this (or complete()) before your tool budget runs out."
+            "REQUIRED after finishing your work in this step. "
+            "Pass your progress summary to the next step. "
+            "Use complete() instead if the entire goal is achieved."
        )

    async def execute(self, message: str, **kwargs: Any) -> str:
@ -88,15 +89,16 @@ _STEP_BUDGET = 8
 _BUDGET_EXHAUSTED_PREFIX = "Tool budget exhausted"

 _LONG_TASK_SYSTEM_PROMPT = """\
-You are one step in a chain. Do a small chunk of work, then call handoff().
+You are one step in a chain working toward a goal.

-1. Check the filesystem to see what's already done (ignore handoff notes).
-2. Do the next small piece of work.
-3. Call handoff() with what you did and where results are saved. \
-If everything is truly done, call complete() instead.
+1. Check the filesystem to see what's already done.
+2. Do the next piece of work. Write results to files as you go — \
+do NOT just collect information without producing output.
+3. When done with your chunk, call handoff() with a brief summary. \
+If the entire goal is finished, call complete() instead.

-You have very few tool calls. Do NOT try to finish everything. \
-Do one chunk, call handoff(), done.
+IMPORTANT: Write output to files early and often. If you run out of \
+tool calls, only what's on the filesystem survives.
 """


@ -105,8 +107,7 @@ def _build_user_message(goal: str, step: int, handoff: str) -> str:
    budget_note = (
        f"\n\n---\n"
        f"Step {step + 1}. You have {_STEP_BUDGET} tool calls total. "
-        f"Reserve the last 1-2 calls for handoff() or complete(). "
-        f"If you run out of calls without calling one, your progress is LOST."
+        f"Call handoff() or complete() before you run out."
    )
    if step == 0:
        return goal + budget_note
@ -177,6 +178,7 @@ class LongTaskTool(Tool):

    async def execute(self, goal: str, max_steps: int = 20, **kwargs: Any) -> str:
        handoff = ""
+        logger.debug("long_task start: max_steps={}, goal={:.120}", max_steps, goal)
        for step in range(max_steps):
            signal_store: dict[str, str] = {}
            user_msg = _build_user_message(goal, step, handoff)
@ -195,19 +197,39 @@ class LongTaskTool(Tool):
                    )
                return f"Long task failed at step {step + 1}/{max_steps}."
            sig_type = signal_store.get("type")
+            sig_payload = signal_store.get("payload", "")
            logger.info(
                "long_task step {}/{}: signal={}, stop_reason={}, tools={}",
-                step + 1, max_steps, sig_type or "none",
+                step + 1, max_steps, sig_type or "auto",
                result.stop_reason,
                result.tools_used,
            )
            if sig_type == "complete":
-                return signal_store["payload"]
-            elif sig_type == "handoff":
-                handoff = signal_store["payload"]
+                logger.debug(
+                    "long_task done at step {}: complete payload={:.200}",
+                    step + 1, sig_payload,
+                )
+                return sig_payload
+
+            # Auto-extract progress — don't require handoff()
+            if sig_type == "handoff":
+                handoff = sig_payload
+                logger.debug("long_task step {} handoff: {:.200}", step + 1, handoff)
+            elif result.stop_reason == "completed":
+                # Subagent returned text naturally (no more tool calls)
+                handoff = result.final_content or ""
+                logger.debug(
+                    "long_task step {} natural end: {:.200}",
+                    step + 1, handoff[:200] if handoff else "(empty)",
+                )
            else:
-                # No signal tool called — extract useful content as fallback
+                # max_iterations hit — extract whatever text the subagent produced
                handoff = _extract_handoff_from_messages(result.messages)
+                logger.debug(
+                    "long_task step {} auto-extract: {:.200}",
+                    step + 1, handoff[:200] if handoff else "(empty)",
+                )
+        logger.warning("long_task exhausted max_steps={}", max_steps)
        return (
            f"Long task reached max steps ({max_steps}). "
            f"Last progress:\n{handoff}"
--- a/tests/agent/tools/test_long_task.py
+++ b/tests/agent/tools/test_long_task.py
@ -158,6 +158,38 @@ async def test_long_task_fallback_when_no_signal_called():
    assert "I processed items 1-5" in result


+@pytest.mark.asyncio
+async def test_long_task_auto_extracts_on_natural_end():
+    """Subagent finishes naturally (stop_reason=completed) without calling signal."""
+    from nanobot.agent.tools.long_task import LongTaskTool
+
+    mgr = _make_manager_stub()
+    steps = 0
+
+    async def fake_run_step(*, system_prompt, user_message, extra_tools):
+        nonlocal steps
+        steps += 1
+        if steps == 1:
+            return _step_result(
+                final_content="I processed items 1-5. Results in out.md.",
+                stop_reason="completed",
+            )
+        # Second step: subagent calls complete
+        for t in extra_tools:
+            if t.name == "complete":
+                await t.execute(summary="All done.")
+        return _step_result(
+            final_content="All done.",
+            tools_used=["complete"],
+        )
+
+    mgr.run_step.side_effect = fake_run_step
+    tool = LongTaskTool(manager=mgr)
+    result = await tool.execute(goal="Process items.", max_steps=5)
+    assert "All done." == result
+    assert steps == 2
+
+
@pytest.mark.asyncio
 async def test_long_task_goal_appears_in_system_prompt():
    """Verify every step's system_prompt contains the long task system prompt."""