fix(long-task): add debug logging for step-level observability

2026-05-20 00:22:31 +00:00 · 2026-04-27 00:39:45 +08:00 · 2026-04-27 00:39:45 +08:00 · e7214d96ed
commit e7214d96ed
parent bf5762a3d4
2 changed files with 70 additions and 16 deletions
--- a/nanobot/agent/tools/long_task.py
+++ b/nanobot/agent/tools/long_task.py
@ -40,8 +40,9 @@ class HandoffTool(Tool):
    @property
    def description(self) -> str:
        return (
-            "You are done with this step. Pass control to the next step. "
+            "REQUIRED after finishing your work in this step. "
-            "You MUST call this (or complete()) before your tool budget runs out."
+            "Pass your progress summary to the next step. "
            "Use complete() instead if the entire goal is achieved."
        )
    async def execute(self, message: str, **kwargs: Any) -> str:
@ -88,15 +89,16 @@ _STEP_BUDGET = 8
 _BUDGET_EXHAUSTED_PREFIX = "Tool budget exhausted"
 _LONG_TASK_SYSTEM_PROMPT = """\
-You are one step in a chain. Do a small chunk of work, then call handoff().
+You are one step in a chain working toward a goal.
-1. Check the filesystem to see what's already done (ignore handoff notes).
+1. Check the filesystem to see what's already done.
-2. Do the next small piece of work.
+2. Do the next piece of work. Write results to files as you go — \
-3. Call handoff() with what you did and where results are saved. \
+do NOT just collect information without producing output.
-If everything is truly done, call complete() instead.
+3. When done with your chunk, call handoff() with a brief summary. \
 If the entire goal is finished, call complete() instead.
-You have very few tool calls. Do NOT try to finish everything. \
+IMPORTANT: Write output to files early and often. If you run out of \
-Do one chunk, call handoff(), done.
+tool calls, only what's on the filesystem survives.
 """
@ -105,8 +107,7 @@ def _build_user_message(goal: str, step: int, handoff: str) -> str:
    budget_note = (
        f"\n\n---\n"
        f"Step {step + 1}. You have {_STEP_BUDGET} tool calls total. "
-        f"Reserve the last 1-2 calls for handoff() or complete(). "
+        f"Call handoff() or complete() before you run out."
        f"If you run out of calls without calling one, your progress is LOST."
    )
    if step == 0:
        return goal + budget_note
@ -177,6 +178,7 @@ class LongTaskTool(Tool):
    async def execute(self, goal: str, max_steps: int = 20, **kwargs: Any) -> str:
        handoff = ""
        logger.debug("long_task start: max_steps={}, goal={:.120}", max_steps, goal)
        for step in range(max_steps):
            signal_store: dict[str, str] = {}
            user_msg = _build_user_message(goal, step, handoff)
@ -195,19 +197,39 @@ class LongTaskTool(Tool):
                    )
                return f"Long task failed at step {step + 1}/{max_steps}."
            sig_type = signal_store.get("type")
            sig_payload = signal_store.get("payload", "")
            logger.info(
                "long_task step {}/{}: signal={}, stop_reason={}, tools={}",
-                step + 1, max_steps, sig_type or "none",
+                step + 1, max_steps, sig_type or "auto",
                result.stop_reason,
                result.tools_used,
            )
            if sig_type == "complete":
-                return signal_store["payload"]
+                logger.debug(
-            elif sig_type == "handoff":
+                    "long_task done at step {}: complete payload={:.200}",
-                handoff = signal_store["payload"]
+                    step + 1, sig_payload,
                )
                return sig_payload
            # Auto-extract progress — don't require handoff()
            if sig_type == "handoff":
                handoff = sig_payload
                logger.debug("long_task step {} handoff: {:.200}", step + 1, handoff)
            elif result.stop_reason == "completed":
                # Subagent returned text naturally (no more tool calls)
                handoff = result.final_content or ""
                logger.debug(
                    "long_task step {} natural end: {:.200}",
                    step + 1, handoff[:200] if handoff else "(empty)",
                )
            else:
-                # No signal tool called — extract useful content as fallback
+                # max_iterations hit — extract whatever text the subagent produced
                handoff = _extract_handoff_from_messages(result.messages)
                logger.debug(
                    "long_task step {} auto-extract: {:.200}",
                    step + 1, handoff[:200] if handoff else "(empty)",
                )
        logger.warning("long_task exhausted max_steps={}", max_steps)
        return (
            f"Long task reached max steps ({max_steps}). "
            f"Last progress:\n{handoff}"
--- a/tests/agent/tools/test_long_task.py
+++ b/tests/agent/tools/test_long_task.py
@ -158,6 +158,38 @@ async def test_long_task_fallback_when_no_signal_called():
    assert "I processed items 1-5" in result
@pytest.mark.asyncio
 async def test_long_task_auto_extracts_on_natural_end():
    """Subagent finishes naturally (stop_reason=completed) without calling signal."""
    from nanobot.agent.tools.long_task import LongTaskTool
    mgr = _make_manager_stub()
    steps = 0
    async def fake_run_step(*, system_prompt, user_message, extra_tools):
        nonlocal steps
        steps += 1
        if steps == 1:
            return _step_result(
                final_content="I processed items 1-5. Results in out.md.",
                stop_reason="completed",
            )
        # Second step: subagent calls complete
        for t in extra_tools:
            if t.name == "complete":
                await t.execute(summary="All done.")
        return _step_result(
            final_content="All done.",
            tools_used=["complete"],
        )
    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)
    result = await tool.execute(goal="Process items.", max_steps=5)
    assert "All done." == result
    assert steps == 2
@pytest.mark.asyncio
 async def test_long_task_goal_appears_in_system_prompt():
    """Verify every step's system_prompt contains the long task system prompt."""