feat(long-task): major overhaul with structured handoffs, validation, and observability

- Structured HandoffState: HandoffTool now accepts files_created, files_modified, next_step_hint, and verification fields instead of a plain string. Progress is passed between steps as structured data.
- Completion validation round: After complete() is called, a dedicated validator step runs to verify the claim against the original goal. If validation fails, the task continues rather than returning a false completion.
- Dynamic prompt system: 3 Jinja2 templates (step_start, step_middle, step_final) selected based on step number. Final steps get tighter budget and stronger "wrap up" guidance.
- Automatic file change tracking: Extracts write_file/edit_file events from tool_events and injects them into the next step's context if the subagent forgot to report them explicitly.
- Budget tracking & adaptive strategy: Cumulative token usage is tracked across steps. Per-step tool budget drops from 8 to 4 in the last two steps to force handoff/completion.
- Crash retry with graceful degradation: A step that crashes is retried once. Persistent crashes terminate the task and return partial progress.
- Full observability hooks for future WebUI integration:
  - set_hooks() with on_step_start, on_step_complete, on_handoff, on_validation_started, on_validation_passed, on_validation_failed, on_task_complete, on_task_error, and catch-all on_event.
  - Readable state properties: current_step, total_steps, status, last_handoff, cumulative_usage, goal.
  - inject_correction() allows external code to send user corrections that are injected into the next step's prompt.
- run_step() accepts optional max_iterations for dynamic budget control.

All 27 long-task tests and 11 subagent tests pass.
2026-05-20 00:22:31 +00:00 · 2026-05-13 00:55:52 +08:00 · 2026-05-13 00:55:52 +08:00 · 78ecb2a99a
commit 78ecb2a99a
parent e7214d96ed
7 changed files with 992 additions and 169 deletions
--- a/nanobot/agent/subagent.py
+++ b/nanobot/agent/subagent.py
@ -132,6 +132,7 @@ class SubagentManager:
        system_prompt: str,
        user_message: str,
        extra_tools: list["Tool"] | None = None,
        max_iterations: int | None = None,
    ) -> AgentRunResult:
        """Run a single subagent step and return the result directly.
@ -150,7 +151,7 @@ class SubagentManager:
            ],
            tools=tools,
            model=self.model,
-            max_iterations=8,
+            max_iterations=max_iterations if max_iterations is not None else 8,
            max_iterations_message=(
                "Tool budget exhausted. "
                "Call handoff() or complete() earlier next time."
--- a/nanobot/agent/tools/long_task.py
+++ b/nanobot/agent/tools/long_task.py
@ -2,12 +2,20 @@
 from __future__ import annotations
 import time
 from dataclasses import dataclass, field
 from typing import Any, TYPE_CHECKING
 from loguru import logger
 from nanobot.agent.tools.base import Tool, tool_parameters
-from nanobot.agent.tools.schema import StringSchema, IntegerSchema, tool_parameters_schema
+from nanobot.agent.tools.schema import (
    ArraySchema,
    IntegerSchema,
    StringSchema,
    tool_parameters_schema,
 )
 from nanobot.utils.prompt_templates import render_template
 if TYPE_CHECKING:
    from nanobot.agent.subagent import SubagentManager
@ -15,7 +23,33 @@ if TYPE_CHECKING:
 # ---------------------------------------------------------------------------
-# Signal tools -- write progress/completion into a shared dict
+# Structured handoff state
 # ---------------------------------------------------------------------------
@dataclass
 class HandoffState:
    """Structured progress state passed between long-task steps.

    One instance is handed to the signal tools of a step as their shared
    store; HandoffTool fills every field, CompleteTool only ``message``.
    """
    # Free-text progress summary (or the completion summary for complete()).
    message: str = ""
    # File paths the step reported as newly created.
    files_created: list[str] = field(default_factory=list)
    # File paths the step reported as modified.
    files_modified: list[str] = field(default_factory=list)
    # Suggestion for what the following step should do.
    next_step_hint: str = ""
    # Description of any verification the step performed.
    verification: str = ""
    def is_empty(self) -> bool:
        """Return True when every field is still empty (falsy)."""
        return not any(
            [
                self.message,
                self.files_created,
                self.files_modified,
                self.next_step_hint,
                self.verification,
            ]
        )
 # ---------------------------------------------------------------------------
 # Signal tools -- write progress/completion into a shared state
 # ---------------------------------------------------------------------------
@tool_parameters(
@ -24,13 +58,28 @@ if TYPE_CHECKING:
            "What you completed in this step and where results are saved. "
            "The next step will pick up from here.",
        ),
        files_created=ArraySchema(
            StringSchema(""),
            description="List of file paths you created in this step",
        ),
        files_modified=ArraySchema(
            StringSchema(""),
            description="List of file paths you modified in this step",
        ),
        next_step_hint=StringSchema(
            "A clear, specific hint about what the next step should do. "
            "Be concrete — e.g. 'Implement the test cases in test_foo.py'",
        ),
        verification=StringSchema(
            "Any verification you performed (tests run, lint passed, etc.)",
        ),
        required=["message"],
    )
 )
 class HandoffTool(Tool):
    """Signal that the step is done but the overall task continues."""
-    def __init__(self, store: dict[str, str]) -> None:
+    def __init__(self, store: HandoffState) -> None:
        self._store = store
    @property
@ -40,14 +89,25 @@ class HandoffTool(Tool):
    @property
    def description(self) -> str:
        return (
-            "REQUIRED after finishing your work in this step. "
+            "You are done with this step. Pass control to the next step. "
-            "Pass your progress summary to the next step. "
+            "You MUST call this (or complete()) before your tool budget runs out. "
-            "Use complete() instead if the entire goal is achieved."
+            "Provide a detailed summary, list files changed, and hint the next step."
        )
-    async def execute(self, message: str, **kwargs: Any) -> str:
+    async def execute(
-        self._store["type"] = "handoff"
+        self,
-        self._store["payload"] = message
+        message: str,
        files_created: list[str] | None = None,
        files_modified: list[str] | None = None,
        next_step_hint: str = "",
        verification: str = "",
        **kwargs: Any,
    ) -> str:
        self._store.message = message
        self._store.files_created = list(files_created or [])
        self._store.files_modified = list(files_modified or [])
        self._store.next_step_hint = next_step_hint
        self._store.verification = verification
        return "Progress recorded. The next step will continue from here."
@ -60,7 +120,7 @@ class HandoffTool(Tool):
 class CompleteTool(Tool):
    """Signal that the entire long task is finished."""
-    def __init__(self, store: dict[str, str]) -> None:
+    def __init__(self, store: HandoffState) -> None:
        self._store = store
    @property
@ -70,48 +130,94 @@ class CompleteTool(Tool):
    @property
    def description(self) -> str:
        return (
-            "The ENTIRE goal is achieved. Call this only when nothing remains."
+            "The ENTIRE goal is achieved. Call this only when nothing remains. "
            "Your claim will be validated — if unproven, the task continues."
        )
    async def execute(self, summary: str, **kwargs: Any) -> str:
-        self._store["type"] = "complete"
+        self._store.message = summary
-        self._store["payload"] = summary
+        return "Task marked as complete. Awaiting validation."
        return "Task marked as complete."
 # ---------------------------------------------------------------------------
-# System prompt for long-task subagent steps
+# Budget and prompt helpers
 # ---------------------------------------------------------------------------
 _STEP_BUDGET = 8
 _FINAL_STEP_BUDGET = 4  # Lower budget for final steps
 # Must match max_iterations_message set in SubagentManager.run_step()
 _BUDGET_EXHAUSTED_PREFIX = "Tool budget exhausted"
 _LONG_TASK_SYSTEM_PROMPT = """\
 You are one step in a chain working toward a goal.
-1. Check the filesystem to see what's already done.
+def _step_budget(step: int, max_steps: int) -> int:
-2. Do the next piece of work. Write results to files as you go — \
+    """Compute per-step tool budget based on progress."""
-do NOT just collect information without producing output.
+    if step >= max_steps - 2:
-3. When done with your chunk, call handoff() with a brief summary. \
+        return _FINAL_STEP_BUDGET
-If the entire goal is finished, call complete() instead.
+    return _STEP_BUDGET
 IMPORTANT: Write output to files early and often. If you run out of \
 tool calls, only what's on the filesystem survives.
 """
-def _build_user_message(goal: str, step: int, handoff: str) -> str:
+def _build_system_prompt(budget: int) -> str:
-    """Build the user message for a subagent step with budget warning."""
+    """Build the system prompt for a subagent step."""
    return (
        "You are one step in a chain working toward a goal.\n\n"
        "Rules:\n"
        "1. Do ONE small chunk of work per step.\n"
        "2. Write results to files — do NOT just collect information.\n"
        "3. Call handoff() when done with your chunk. "
        "Call complete() ONLY if the ENTIRE goal is achieved.\n"
        f"4. You have {budget} tool calls. "
        "Reserve the last 1-2 for handoff() or complete()."
    )
 def _build_user_message(
    goal: str,
    step: int,
    max_steps: int,
    handoff: HandoffState,
    correction: str | None = None,
 ) -> str:
    """Build the user message for a subagent step using templates."""
    budget = _step_budget(step, max_steps)
    budget_note = (
        f"\n\n---\n"
-        f"Step {step + 1}. You have {_STEP_BUDGET} tool calls total. "
+        f"Step {step + 1} of {max_steps}. You have {budget} tool calls for this step. "
-        f"Call handoff() or complete() before you run out."
+        f"Reserve the last 1-2 calls for handoff() or complete(). "
        f"If you run out of calls without calling one, your progress is LOST."
    )
    if step == 0:
-        return goal + budget_note
+        prompt = render_template(
-    return f"{goal}\n\n## Previous Progress\n{handoff}{budget_note}"
+            "agent/long_task/step_start.md",
            step=step,
            max_steps=max_steps,
            goal=goal,
            budget=budget,
        )
    elif step >= max_steps - 3:
        prompt = render_template(
            "agent/long_task/step_final.md",
            step=step,
            max_steps=max_steps,
            goal=goal,
            budget=budget,
            handoff=handoff,
        )
    else:
        prompt = render_template(
            "agent/long_task/step_middle.md",
            step=step,
            max_steps=max_steps,
            goal=goal,
            budget=budget,
            handoff=handoff,
        )
    if correction:
        prompt += f"\n\n## User Correction\n{correction}\n"
    return prompt + budget_note
 def _extract_handoff_from_messages(messages: list[dict[str, Any]]) -> str:
@ -132,6 +238,42 @@ def _extract_handoff_from_messages(messages: list[dict[str, Any]]) -> str:
    return ""
 def _extract_file_changes(
    tool_events: list[dict[str, Any]],
 ) -> tuple[list[str], list[str]]:
    """Collect file paths touched by successful write_file/edit_file events.

    Only events whose ``status`` is ``"ok"`` are considered. The path is
    parsed from a ``detail`` string that starts with ``"Wrote "`` or
    ``"Edited "``: everything after the first space up to the first colon.

    Args:
        tool_events: Tool event dicts; the keys read are ``name``,
            ``status`` and ``detail``. Missing keys are tolerated.

    Returns:
        ``(created, modified)`` — paths from ``write_file`` events and from
        ``edit_file`` events respectively, each de-duplicated and kept in
        first-seen order.
    """
    created: list[str] = []
    modified: list[str] = []
    for event in tool_events:
        if event.get("status", "") != "ok":
            continue
        name = event.get("name", "")
        if name not in ("write_file", "edit_file"):
            continue
        # ``detail`` can be present but None (or not a string at all); treat
        # that as "no parsable detail" instead of raising AttributeError.
        detail = event.get("detail") or ""
        if not isinstance(detail, str):
            continue
        if not detail.startswith(("Wrote ", "Edited ")):
            continue
        path = detail.split(" ", 1)[1].split(":")[0].strip()
        if not path:
            # Nothing usable between the verb and the colon.
            continue
        bucket = created if name == "write_file" else modified
        # A file written several times in one step should be listed once.
        if path not in bucket:
            bucket.append(path)
    return created, modified
 # ---------------------------------------------------------------------------
 # Observability: events and hooks
 # ---------------------------------------------------------------------------
@dataclass
 class LongTaskEvent:
    """Event emitted during long-task execution for observability."""
    # Event name without the "on_" prefix, e.g. "step_start".
    type: str
    # Keyword payload that accompanies the event.
    payload: dict[str, Any] = field(default_factory=dict)
    # Creation time taken from time.time() when the event is constructed.
    timestamp: float = field(default_factory=time.time)
 # ---------------------------------------------------------------------------
 # Long Task Tool — the orchestrator
 # ---------------------------------------------------------------------------
@ -152,14 +294,32 @@ class LongTaskTool(Tool):
    def __init__(self, manager: SubagentManager) -> None:
        """Bind the subagent manager, clear the hooks and reset the state."""
        self._hooks: dict[str, Any] = {}
        self._manager = manager
        self._reset_state()
-    @classmethod
+    def _reset_state(self) -> None:
-    def enabled(cls, ctx: ToolContext) -> bool:
+        """Reset internal state before a new execution.
        return ctx.subagent_manager is not None
-    @classmethod
+        Preserves any pending user corrections so inject_correction() can be
-    def create(cls, ctx: ToolContext) -> Tool:
+        called before execute() starts.
-        return cls(manager=ctx.subagent_manager)
+        """
        existing_signals = (
            self._state.get("signal_queue", []) if hasattr(self, "_state") else []
        )
        self._state: dict[str, Any] = {
            "current_step": 0,
            "total_steps": 0,
            "goal": "",
            "status": "idle",  # idle, running, validating, completed, error
            "last_handoff": HandoffState(),
            "cumulative_usage": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0,
            },
            "signal_queue": existing_signals,
            "error": None,
        }
    @property
    def name(self) -> str:
@ -176,61 +336,315 @@ class LongTaskTool(Tool):
            "goal. For simple independent tasks, use spawn instead."
        )
-    async def execute(self, goal: str, max_steps: int = 20, **kwargs: Any) -> str:
+    @classmethod
-        handoff = ""
+    def enabled(cls, ctx: ToolContext) -> bool:
-        logger.debug("long_task start: max_steps={}, goal={:.120}", max_steps, goal)
+        return ctx.subagent_manager is not None
-        for step in range(max_steps):
+
-            signal_store: dict[str, str] = {}
+    @classmethod
-            user_msg = _build_user_message(goal, step, handoff)
+    def create(cls, ctx: ToolContext) -> Tool:
        return cls(manager=ctx.subagent_manager)
    # --- State exposure for WebUI observability ---
    @property
    def current_step(self) -> int:
        """Zero-based index of the step recorded in the internal state."""
        state = self._state
        return state["current_step"]
    @property
    def total_steps(self) -> int:
        """Step limit recorded for the current (or most recent) run."""
        state = self._state
        return state["total_steps"]
    @property
    def status(self) -> str:
        """Lifecycle status string held in the internal state."""
        state = self._state
        return state["status"]
    @property
    def last_handoff(self) -> HandoffState:
        """Most recent handoff stored in the internal state (same object, not a copy)."""
        state = self._state
        return state["last_handoff"]
    @property
    def cumulative_usage(self) -> dict[str, int]:
        """Shallow copy of the accumulated token counters, safe for callers to mutate."""
        counters = self._state["cumulative_usage"]
        return {**counters}
    @property
    def goal(self) -> str:
        """Goal text recorded in the internal state."""
        state = self._state
        return state["goal"]
    # --- External signal mechanism (for user correction) ---
    def inject_correction(self, message: str) -> None:
        """Queue a user correction so a later step can pick it up.

        The message is appended to the pending signal queue and its first
        120 characters are logged.
        """
        pending = self._state["signal_queue"]
        pending.append(message)
        preview = message[:120]
        logger.info("LongTask correction injected: {}", preview)
    def _pop_signal(self) -> str | None:
        """Remove and return the oldest queued correction, or None if the queue is empty."""
        pending = self._state["signal_queue"]
        return pending.pop(0) if pending else None
    # --- Hook system for WebUI and logging ---
    def set_hooks(self, hooks: dict[str, Any]) -> None:
        """Register observability hooks, replacing any previously set ones.

        A hook is looked up under ``on_<event_type>`` and called with the
        event payload as keyword arguments, so its parameter names must match
        the payload keys. Declaring ``**kwargs`` keeps a hook tolerant of
        emit sites that pass a different set of keys.

        Supported hooks (all optional):
        - on_task_start(goal, max_steps)
        - on_step_start(step, goal, budget)
        - on_step_complete(step, result, handoff)
        - on_handoff(step, handoff)  # also accepted as on_handoff_received
        - on_validation_started(step, completion_summary)
        - on_validation_passed(step, summary)
        - on_validation_failed(step, reason)
        - on_task_complete(step, summary)
        - on_task_error(step, error)
        - on_event(event: LongTaskEvent)  # catch-all

        Args:
            hooks: Mapping of hook name to callable. It is copied, so later
                changes to the caller's dict do not affect registration.
        """
        registered = dict(hooks)
        # The orchestrator emits the handoff event as "handoff_received", which
        # resolves to "on_handoff_received". Alias the documented "on_handoff"
        # name to it so a hook registered under that name is actually invoked.
        documented = registered.get("on_handoff")
        if documented is not None:
            registered.setdefault("on_handoff_received", documented)
        self._hooks = registered
    def _emit(self, event_type: str, **payload: Any) -> None:
        """Emit an event to registered hooks."""
        event = LongTaskEvent(type=event_type, payload=payload)
        logger.debug("LongTask event: {} | {}", event_type, payload)
        # Call catch-all hook
        catch_all = self._hooks.get("on_event")
        if catch_all is not None:
            try:
-                result = await self._manager.run_step(
+                catch_all(event)
                    system_prompt=_LONG_TASK_SYSTEM_PROMPT,
                    user_message=user_msg,
                    extra_tools=[HandoffTool(signal_store), CompleteTool(signal_store)],
                )
            except Exception:
-                logger.exception("long_task step {}/{} failed", step + 1, max_steps)
+                logger.exception("LongTask on_event hook failed")
-                if handoff:
+
        # Call specific hook
        hook_name = f"on_{event_type}"
        hook = self._hooks.get(hook_name)
        if hook is not None:
            try:
                hook(**payload)
            except Exception:
                logger.exception("LongTask {} hook failed", hook_name)
    # --- Core execution ---
    async def execute(self, goal: str, max_steps: int = 20, **kwargs: Any) -> str:
        handoff = HandoffState()
        self._reset_state()
        self._state["goal"] = goal
        self._state["total_steps"] = max_steps
        self._state["status"] = "running"
        logger.debug("long_task start: max_steps={}, goal={:.120}", max_steps, goal)
        self._emit("task_start", goal=goal, max_steps=max_steps)
        for step in range(max_steps):
            self._state["current_step"] = step
            signal_store = HandoffState()
            correction = self._pop_signal()
            user_msg = _build_user_message(
                goal, step, max_steps, handoff, correction=correction
            )
            budget = _step_budget(step, max_steps)
            self._emit("step_start", step=step, goal=goal, budget=budget)
            # Run the step with retry on crash
            result = await self._run_step_with_retry(
                system_prompt=_build_system_prompt(budget),
                user_message=user_msg,
                extra_tools=[HandoffTool(signal_store), CompleteTool(signal_store)],
                step=step,
                budget=budget,
            )
            if result is None:
                # Fatal error after retry
                self._state["status"] = "error"
                self._emit("task_error", step=step, error=self._state["error"])
                if handoff.message:
                    return (
                        f"Long task failed at step {step + 1}/{max_steps}. "
-                        f"Last progress:\n{handoff}"
+                        f"Last progress:\n{handoff.message}"
                    )
                return f"Long task failed at step {step + 1}/{max_steps}."
-            sig_type = signal_store.get("type")
+
-            sig_payload = signal_store.get("payload", "")
+            # Accumulate usage
            usage = getattr(result, "usage", {}) or {}
            for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
                self._state["cumulative_usage"][key] += usage.get(key, 0)
            # Extract file changes from tool events for automatic tracking
            tool_events = getattr(result, "tool_events", []) or []
            auto_created, auto_modified = _extract_file_changes(tool_events)
            if auto_created or auto_modified:
                logger.debug(
                    "long_task step {}: auto-detected files created={}, modified={}",
                    step + 1,
                    auto_created,
                    auto_modified,
                )
            self._emit("step_complete", step=step, result=result, handoff=signal_store)
            # Determine signal from tool events
            sig_type = "none"
            for event in tool_events:
                ev_name = event.get("name", "")
                if ev_name == "complete":
                    sig_type = "complete"
                    break
                elif ev_name == "handoff":
                    sig_type = "handoff"
                    break
            # Fallback: if no explicit signal but CompleteTool/HandoffTool was
            # called without arguments (message empty), use final_content
            if sig_type == "none" and signal_store.message:
                # Tool was called but we couldn't detect from events;
                # use the store content as handoff
                sig_type = "handoff"
            elif sig_type == "none":
                signal_store.message = _extract_handoff_from_messages(
                    getattr(result, "messages", []) or []
                )
                if signal_store.message:
                    sig_type = "handoff"
            sig_payload = signal_store.message
            logger.info(
                "long_task step {}/{}: signal={}, stop_reason={}, tools={}",
-                step + 1, max_steps, sig_type or "auto",
+                step + 1,
                max_steps,
                sig_type,
                result.stop_reason,
                result.tools_used,
            )
            if sig_type == "complete":
                logger.debug(
                    "long_task done at step {}: complete payload={:.200}",
                    step + 1, sig_payload,
                )
                return sig_payload
-            # Auto-extract progress — don't require handoff()
+            if sig_type == "complete":
-            if sig_type == "handoff":
+                # Validation round
-                handoff = sig_payload
+                self._state["status"] = "validating"
-                logger.debug("long_task step {} handoff: {:.200}", step + 1, handoff)
+                self._emit(
-            elif result.stop_reason == "completed":
+                    "validation_started",
-                # Subagent returned text naturally (no more tool calls)
+                    step=step,
-                handoff = result.final_content or ""
+                    completion_summary=sig_payload,
                logger.debug(
                    "long_task step {} natural end: {:.200}",
                    step + 1, handoff[:200] if handoff else "(empty)",
                )
                validated = await self._validate_completion(
                    goal, sig_payload, max_steps
                )
                if validated:
                    self._state["status"] = "completed"
                    self._emit("task_complete", step=step, summary=sig_payload)
                    return sig_payload
                else:
                    self._emit(
                        "validation_failed",
                        step=step,
                        reason="Validation did not confirm completion",
                    )
                    # Fall through to handoff — continue working
                    handoff = signal_store
                    handoff.next_step_hint = (
                        f"Validation failed. Continue working toward the goal. "
                        f"Previous claim: {sig_payload}"
                    )
                    self._state["last_handoff"] = handoff
                    continue
            elif sig_type == "handoff":
                self._emit("handoff_received", step=step, handoff=signal_store)
                # Merge auto-detected file changes if not explicitly reported
                if auto_created and not signal_store.files_created:
                    signal_store.files_created = auto_created
                if auto_modified and not signal_store.files_modified:
                    signal_store.files_modified = auto_modified
                handoff = signal_store
                self._state["last_handoff"] = handoff
                continue
            else:
-                # max_iterations hit — extract whatever text the subagent produced
+                # No signal — use extracted content as handoff
-                handoff = _extract_handoff_from_messages(result.messages)
+                handoff = HandoffState(message=signal_store.message)
-                logger.debug(
+                self._state["last_handoff"] = handoff
-                    "long_task step {} auto-extract: {:.200}",
+
-                    step + 1, handoff[:200] if handoff else "(empty)",
+        self._state["status"] = "error"
-                )
+        self._emit("task_error", step=max_steps, error="Max steps reached")
        logger.warning("long_task exhausted max_steps={}", max_steps)
        return (
            f"Long task reached max steps ({max_steps}). "
-            f"Last progress:\n{handoff}"
+            f"Last progress:\n{handoff.message}"
        )
    async def _run_step_with_retry(
        self,
        system_prompt: str,
        user_message: str,
        extra_tools: list[Any],
        step: int,
        budget: int,
    ) -> Any:
        """Execute one subagent step, retrying a single time if it raises.

        Both attempts call ``self._manager.run_step`` with the same arguments,
        passing ``budget`` as ``max_iterations``.

        Returns:
            Whatever ``run_step`` returns, or ``None`` when the retry raises
            as well; in that case the error text is stored in
            ``self._state["error"]``.
        """
        call_args = dict(
            system_prompt=system_prompt,
            user_message=user_message,
            extra_tools=extra_tools,
            max_iterations=budget,
        )
        human_step = step + 1
        try:
            return await self._manager.run_step(**call_args)
        except Exception as first_err:
            logger.warning(
                "long_task step {}/{} crashed (will retry once): {}",
                human_step,
                self._state["total_steps"],
                first_err,
            )
            # The retry stays inside this handler so a second failure keeps
            # the first one as its exception context.
            try:
                return await self._manager.run_step(**call_args)
            except Exception as second_err:
                logger.exception(
                    "long_task step {}/{} failed after retry",
                    human_step,
                    self._state["total_steps"],
                )
                self._state["error"] = str(second_err)
                return None
    async def _validate_completion(
        self, goal: str, completion_summary: str, max_steps: int
    ) -> bool:
        """Run a validation step to verify the completion claim.

        A short subagent step (4 iterations) is given the rendered
        ``agent/long_task/validation.md`` prompt plus fresh handoff/complete
        tools. The claim counts as verified when the step's tool events
        contain a ``complete`` call.

        Args:
            goal: The original task goal.
            completion_summary: The summary passed to ``complete()`` by the
                step that claimed completion.
            max_steps: Accepted for interface compatibility; not used here.

        Returns:
            True if the validator called ``complete()``; False otherwise,
            including when the validation step itself raises.
        """
        # The documented hook signatures for validation_passed/_failed take a
        # ``step`` keyword; read it from state so hooks written to that
        # signature are not rejected with a TypeError inside _emit().
        step = self._state["current_step"]
        try:
            validation_store = HandoffState()
            validation_prompt = render_template(
                "agent/long_task/validation.md",
                goal=goal,
                completion_summary=completion_summary,
            )
            result = await self._manager.run_step(
                system_prompt=validation_prompt,
                user_message="Validate the claimed completion. "
                "Call complete() if verified, handoff() if not.",
                extra_tools=[
                    HandoffTool(validation_store),
                    CompleteTool(validation_store),
                ],
                max_iterations=4,  # Short validation step
            )
            # If complete() was called, validation passed
            tool_events = getattr(result, "tool_events", []) or []
            for event in tool_events:
                if event.get("name") == "complete":
                    self._emit(
                        "validation_passed",
                        step=step,
                        summary=completion_summary,
                    )
                    return True
            # Carries the validator's own explanation when it gave one.
            self._emit(
                "validation_failed",
                step=step,
                reason=validation_store.message or "Validator did not confirm",
            )
            return False
        except Exception:
            # Best effort: a crashing validator is treated as "not verified".
            logger.exception("Validation step failed")
            return False
--- a/nanobot/templates/agent/long_task/step_final.md
+++ b/nanobot/templates/agent/long_task/step_final.md
@ -0,0 +1,28 @@
 # Long Task — FINAL Step {{ step + 1 }}/{{ max_steps }}
 **This is one of the LAST steps. You are running out of budget.**
 ## Goal
 {{ goal }}
 ## Previous Progress
 {% if handoff.message %}
 {{ handoff.message }}
 {% endif %}
 {% if handoff.files_created or handoff.files_modified %}
 ### Files Changed
 {% for f in handoff.files_created %}
 - Created: `{{ f }}`
 {% endfor %}
 {% for f in handoff.files_modified %}
 - Modified: `{{ f }}`
 {% endfor %}
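 {% endif %}
 {% if handoff.next_step_hint %}
 ### Suggested Next Step
 {{ handoff.next_step_hint }}
 {% endif %}
 {% if handoff.verification %}
 ### Verification
 {{ handoff.verification }}
 {% endif %}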
 ## Instructions
 1. **Do NOT start new work**. Only finish what is already in progress.
 2. **Wrap up**: Complete any partial work, write final results to files.
 3. **Final handoff**: Call `handoff()` with a clear summary of what remains unfinished. Call `complete()` ONLY if you are 100% sure everything is done.
 You have {{ budget }} tool calls total. Reserve the last 1-2 calls for `handoff()` or `complete()`.
--- a/nanobot/templates/agent/long_task/step_middle.md
+++ b/nanobot/templates/agent/long_task/step_middle.md
@ -0,0 +1,38 @@
 # Long Task — Step {{ step + 1 }}/{{ max_steps }}
 You are one step in a chain working toward a goal.
 ## Goal
 {{ goal }}
 ## Previous Progress
 {% if handoff.message %}
 {{ handoff.message }}
 {% endif %}
 {% if handoff.files_created or handoff.files_modified %}
 ### Files Changed
 {% for f in handoff.files_created %}
 - Created: `{{ f }}`
 {% endfor %}
 {% for f in handoff.files_modified %}
 - Modified: `{{ f }}`
 {% endfor %}
 {% endif %}
 {% if handoff.next_step_hint %}
 ### Suggested Next Step
 {{ handoff.next_step_hint }}
 {% endif %}
 {% if handoff.verification %}
 ### Verification
 {{ handoff.verification }}
 {% endif %}
 ## Instructions
 1. **Check existing work**: Use the file list above — do NOT re-explore files already handled.
 2. **Do the next chunk**: Make concrete progress. Write results to files.
 3. **Handoff**: Call `handoff()` with your progress summary, files changed, and a hint for the next step. Call `complete()` only if the ENTIRE goal is achieved.
 You have {{ budget }} tool calls total. Reserve the last 1-2 calls for `handoff()` or `complete()`.
--- a/nanobot/templates/agent/long_task/step_start.md
+++ b/nanobot/templates/agent/long_task/step_start.md
@ -0,0 +1,14 @@
 # Long Task — Step {{ step + 1 }}/{{ max_steps }}
 You are the FIRST step in a chain working toward a goal.
 ## Goal
 {{ goal }}
 ## Instructions
 1. **Explore first**: Check the filesystem to understand the current state. Do NOT assume anything.
 2. **Plan your work**: Decide what chunk you will do in this step.
 3. **Do the work**: Make concrete progress. Write results to files — do NOT just collect information without producing output.
 4. **Handoff**: When done, call `handoff()` with a detailed summary. If the ENTIRE goal is already achieved, call `complete()` instead.
 You have {{ budget }} tool calls total. Reserve the last 1-2 calls for `handoff()` or `complete()`.
--- a/nanobot/templates/agent/long_task/validation.md
+++ b/nanobot/templates/agent/long_task/validation.md
@ -0,0 +1,16 @@
 # Validation — Confirm Goal Completion
 ## Original Goal
 {{ goal }}
 ## Claimed Completion
 {{ completion_summary }}
 ## Instructions
 You are a strict validator. Review the claimed completion against the original goal.
 1. Check every requirement in the goal. Is each one actually satisfied by evidence (files created, tests passing, etc.)?
 2. If ANY requirement is unproven or incomplete, call `handoff()` with what is missing.
 3. If ALL requirements are satisfied with evidence, call `complete()` confirming the validation.
 Be skeptical. "Looks correct" is not enough — verify against the filesystem.
--- a/tests/agent/tools/test_long_task.py
+++ b/tests/agent/tools/test_long_task.py
@ -2,53 +2,82 @@
 import pytest
 from types import SimpleNamespace
 from unittest.mock import AsyncMock, MagicMock
 from nanobot.agent.tools.long_task import (
    HandoffState,
    HandoffTool,
    CompleteTool,
    LongTaskTool,
    LongTaskEvent,
    _build_system_prompt,
    _build_user_message,
    _extract_file_changes,
    _extract_handoff_from_messages,
 )
 # ---------------------------------------------------------------------------
 # Signal tool tests
 # ---------------------------------------------------------------------------
@pytest.mark.asyncio
-async def test_handoff_tool_stores_signal():
+async def test_handoff_tool_stores_structured_signal():
-    from nanobot.agent.tools.long_task import HandoffTool
+    store = HandoffState()
    store: dict[str, str] = {}
    tool = HandoffTool(store)
-    result = await tool.execute(message="Processed items 1-8. Results in out.md. Continue with item 9.")
+    result = await tool.execute(
        message="Processed items 1-8. Results in out.md.",
        files_created=["out.md", "report.md"],
        files_modified=["main.py"],
        next_step_hint="Continue with item 9.",
        verification="Tests passed",
    )
    assert result == "Progress recorded. The next step will continue from here."
-    assert store["type"] == "handoff"
+    assert store.message == "Processed items 1-8. Results in out.md."
-    assert store["payload"] == "Processed items 1-8. Results in out.md. Continue with item 9."
+    assert store.files_created == ["out.md", "report.md"]
    assert store.files_modified == ["main.py"]
    assert store.next_step_hint == "Continue with item 9."
    assert store.verification == "Tests passed"
@pytest.mark.asyncio
 async def test_handoff_tool_defaults_optional_fields():
    """After handoff() is called with only a message, the optional fields are empty."""
    state = HandoffState()
    await HandoffTool(state).execute(message="Done.")
    # List-valued fields come back as empty lists, text fields as empty strings.
    assert [state.files_created, state.files_modified] == [[], []]
    assert (state.next_step_hint, state.verification) == ("", "")
@pytest.mark.asyncio
 async def test_complete_tool_stores_signal():
-    from nanobot.agent.tools.long_task import CompleteTool
+    store = HandoffState()
    store: dict[str, str] = {}
    tool = CompleteTool(store)
    result = await tool.execute(summary="All 100 items processed. Summary in report.md")
-    assert result == "Task marked as complete."
+    assert result == "Task marked as complete. Awaiting validation."
-    assert store["type"] == "complete"
+    assert store.message == "All 100 items processed. Summary in report.md"
    assert store["payload"] == "All 100 items processed. Summary in report.md"
@pytest.mark.asyncio
 async def test_signal_tools_overwrite_on_multiple_calls():
-    """Last call wins -- the orchestrator only reads the final signal."""
+    """Last call wins — the orchestrator only reads the final signal."""
-    from nanobot.agent.tools.long_task import HandoffTool, CompleteTool
+    store = HandoffState()
    store: dict[str, str] = {}
    handoff = HandoffTool(store)
    complete = CompleteTool(store)
    await handoff.execute(message="first progress")
-    assert store["type"] == "handoff"
+    assert store.message == "first progress"
    await complete.execute(summary="done early")
-    assert store["type"] == "complete"
+    assert store.message == "done early"
    assert store["payload"] == "done early"
 # ---------------------------------------------------------------------------
 # Helper: minimal SubagentManager stub
 # ---------------------------------------------------------------------------
 def _make_manager_stub():
    """Create a minimal SubagentManager stub with a mockable run_step."""
    mgr = MagicMock()
@ -64,6 +93,7 @@ def _step_result(**overrides):
        tool_events=[],
        stop_reason="completed",
        tools_used=[],
        usage={},
    )
    defaults.update(overrides)
    return SimpleNamespace(**defaults)
@ -76,68 +106,150 @@ def _step_result(**overrides):
@pytest.mark.asyncio
 async def test_long_task_completes_in_one_step():
-    """Subagent calls complete() immediately."""
+    """Subagent calls complete() immediately, validation passes."""
    from nanobot.agent.tools.long_task import LongTaskTool
    mgr = _make_manager_stub()
    call_count = 0
-    async def fake_run_step(*, system_prompt, user_message, extra_tools):
+    async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
-        for t in extra_tools:
+        nonlocal call_count
-            if t.name == "complete":
+        call_count += 1
-                await t.execute(summary="All done. Report in summary.md")
+        if call_count == 1:
-        return _step_result(
+            for t in extra_tools:
-            final_content="All done.",
+                if t.name == "complete":
-            tools_used=["complete"],
+                    await t.execute(summary="All done. Report in summary.md")
-        )
+            return _step_result(
                final_content="All done.",
                tools_used=["complete"],
                tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
            )
        else:
            # Validation round
            for t in extra_tools:
                if t.name == "complete":
                    await t.execute(summary="Validated")
            return _step_result(
                tools_used=["complete"],
                tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
            )
    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)
    result = await tool.execute(goal="Audit all issues.")
    assert result == "All done. Report in summary.md"
    assert call_count == 2  # main step + validation
@pytest.mark.asyncio
 async def test_long_task_completes_after_multiple_handoffs():
-    """Subagent calls handoff() twice then complete()."""
+    """Subagent calls handoff() twice then complete(), validation passes."""
    from nanobot.agent.tools.long_task import LongTaskTool
    mgr = _make_manager_stub()
    call_count = 0
-    async def fake_run_step(*, system_prompt, user_message, extra_tools):
+    async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        nonlocal call_count
        call_count += 1
        if call_count == 1:
            for t in extra_tools:
                if t.name == "handoff":
                    await t.execute(message="Processed 1-8.")
            return _step_result(
                tools_used=["handoff"],
                tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
            )
        elif call_count == 2:
            assert "Processed 1-8." in user_message
-            assert "8 tool calls" in user_message
+            assert "Step 2" in user_message or "Step 2 of" in user_message
            for t in extra_tools:
                if t.name == "handoff":
                    await t.execute(message="Processed 9-16.")
-        else:
+            return _step_result(
                tools_used=["handoff"],
                tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
            )
        elif call_count == 3:
            for t in extra_tools:
                if t.name == "complete":
                    await t.execute(summary="All 16 items audited.")
-        return _step_result(tools_used=["handoff"])
+            return _step_result(
                tools_used=["complete"],
                tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
            )
        else:
            # Validation round
            for t in extra_tools:
                if t.name == "complete":
                    await t.execute(summary="Validated")
            return _step_result(
                tools_used=["complete"],
                tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
            )
    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)
    result = await tool.execute(goal="Audit 16 issues.")
    assert result == "All 16 items audited."
-    assert call_count == 3
+    assert call_count == 4  # 3 main steps + validation
@pytest.mark.asyncio
 async def test_long_task_validation_falls_back_to_handoff():
    """A completion claim rejected by the validator sends the task back to work."""
    manager = _make_manager_stub()
    # One (signal tool name, execute kwargs) entry per run_step call, in call order.
    # Calls beyond the script reuse the last entry, i.e. a passing validation.
    script = [
        ("complete", {"summary": "Done."}),  # first step claims completion
        ("handoff", {"message": "Not actually done. Need more work."}),  # validation rejects
        ("complete", {"summary": "Really done."}),  # work continues and completes for real
        ("complete", {"summary": "Validated"}),  # second validation passes
    ]
    call_count = 0
    async def scripted_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        nonlocal call_count
        call_count += 1
        signal, kwargs = script[min(call_count, len(script)) - 1]
        for candidate in extra_tools:
            if candidate.name == signal:
                await candidate.execute(**kwargs)
        return _step_result(
            tools_used=[signal],
            tool_events=[{"name": signal, "status": "ok", "detail": ""}],
        )
    manager.run_step.side_effect = scripted_run_step
    long_task = LongTaskTool(manager=manager)
    outcome = await long_task.execute(goal="Do something.", max_steps=5)
    assert outcome == "Really done."
    assert call_count == 4
@pytest.mark.asyncio
 async def test_long_task_fallback_when_no_signal_called():
    """Subagent doesn't call handoff/complete — extract progress from messages."""
    from nanobot.agent.tools.long_task import LongTaskTool
    mgr = _make_manager_stub()
-    async def fake_run_step(*, system_prompt, user_message, extra_tools):
+    async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        return _step_result(
            final_content="Tool budget exhausted.",
            messages=[
@ -161,57 +273,227 @@ async def test_long_task_fallback_when_no_signal_called():
@pytest.mark.asyncio
 async def test_long_task_auto_extracts_on_natural_end():
    """Subagent finishes naturally (stop_reason=completed) without calling signal."""
    from nanobot.agent.tools.long_task import LongTaskTool
    mgr = _make_manager_stub()
-    steps = 0
+    call_count = 0
-    async def fake_run_step(*, system_prompt, user_message, extra_tools):
+    async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
-        nonlocal steps
+        nonlocal call_count
-        steps += 1
+        call_count += 1
-        if steps == 1:
+        if call_count == 1:
            return _step_result(
                final_content="I processed items 1-5. Results in out.md.",
                stop_reason="completed",
            )
-        # Second step: subagent calls complete
+        elif call_count == 2:
-        for t in extra_tools:
+            for t in extra_tools:
-            if t.name == "complete":
+                if t.name == "complete":
-                await t.execute(summary="All done.")
+                    await t.execute(summary="All done.")
-        return _step_result(
+            return _step_result(
-            final_content="All done.",
+                final_content="All done.",
-            tools_used=["complete"],
+                tools_used=["complete"],
-        )
+                tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
            )
        else:
            # Validation
            for t in extra_tools:
                if t.name == "complete":
                    await t.execute(summary="Validated")
            return _step_result(
                tools_used=["complete"],
                tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
            )
    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)
    result = await tool.execute(goal="Process items.", max_steps=5)
    assert "All done." == result
-    assert steps == 2
+    assert call_count == 3
@pytest.mark.asyncio
-async def test_long_task_goal_appears_in_system_prompt():
+async def test_long_task_retries_on_crash():
-    """Verify every step's system_prompt contains the long task system prompt."""
+    """A step that crashes once should be retried."""
    from nanobot.agent.tools.long_task import LongTaskTool
    mgr = _make_manager_stub()
-    captured_prompts = []
+    call_count = 0
-    async def fake_run_step(*, system_prompt, user_message, extra_tools):
+    async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
-        captured_prompts.append(system_prompt)
+        nonlocal call_count
        call_count += 1
        if call_count == 1:
            raise RuntimeError("Simulated crash")
        for t in extra_tools:
            if t.name == "complete":
-                await t.execute(summary="done")
+                await t.execute(summary="Recovered.")
-        return _step_result(final_content="done")
+        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )
    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)
-    await tool.execute(goal="Audit everything.")
+    result = await tool.execute(goal="Test retry.")
-    assert len(captured_prompts) == 1
+    assert "Recovered." == result
-    assert "handoff()" in captured_prompts[0]
+    assert call_count == 3  # main step + retry + validation
-    assert "complete()" in captured_prompts[0]
+
-    assert "filesystem" in captured_prompts[0]
+
@pytest.mark.asyncio
 async def test_long_task_fails_after_two_crashes():
    """When run_step raises on every call, the task reports failure at step 1."""
    manager = _make_manager_stub()
    async def always_crash(*, system_prompt, user_message, extra_tools, max_iterations=None):
        raise RuntimeError("Persistent crash")
    manager.run_step.side_effect = always_crash
    long_task = LongTaskTool(manager=manager)
    outcome = await long_task.execute(goal="Test failure.", max_steps=3)
    # The returned text names the failing step and the tool ends in the error state.
    assert "failed at step 1/3" in outcome
    assert long_task.status == "error"
@pytest.mark.asyncio
 async def test_long_task_uses_dynamic_budget():
    """Early steps get the full tool budget; the last two steps get the reduced one.

    Every step hands off instead of completing, so the run walks through all
    ``max_steps`` steps and the ``max_iterations`` passed to each can be observed.
    Completing on the first step (as this test used to) never reaches the
    reduced-budget steps, so the lower value was never actually checked.
    """
    mgr = _make_manager_stub()
    captured_budgets = []
    async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        captured_budgets.append(max_iterations)
        for t in extra_tools:
            if t.name == "handoff":
                await t.execute(message="Progress.")
        return _step_result(
            tools_used=["handoff"],
            tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
        )
    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)
    await tool.execute(goal="Test budget.", max_steps=5)
    # Steps 1-3 run with the full budget of 8; steps 4-5 drop to 4.
    assert captured_budgets == [8, 8, 8, 4, 4]
 # ---------------------------------------------------------------------------
 # Hook and observability tests
 # ---------------------------------------------------------------------------
@pytest.mark.asyncio
 async def test_hooks_receive_events():
    """Every named hook registered through set_hooks() fires at least once in a run."""
    manager = _make_manager_stub()
    recorded = []
    hook_labels = (
        "task_start",
        "step_start",
        "step_complete",
        "validation_started",
        "task_complete",
    )
    async def completing_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        for candidate in extra_tools:
            if candidate.name != "complete":
                continue
            await candidate.execute(summary="Done.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )
    def recorder(label):
        # Bind the label at creation time so each hook records under its own name.
        return lambda **kw: recorded.append((label, kw))
    manager.run_step.side_effect = completing_run_step
    long_task = LongTaskTool(manager=manager)
    long_task.set_hooks({f"on_{label}": recorder(label) for label in hook_labels})
    await long_task.execute(goal="Test hooks.")
    fired = {label for label, _ in recorded}
    for label in hook_labels:
        assert label in fired
@pytest.mark.asyncio
 async def test_catch_all_hook_receives_events():
    """The on_event hook sees the task_start, step_start and task_complete events."""
    manager = _make_manager_stub()
    seen_types = []
    async def completing_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        for candidate in extra_tools:
            if candidate.name != "complete":
                continue
            await candidate.execute(summary="Done.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )
    def record_type(ev):
        # Only the event's type is kept; the rest of the event object is ignored.
        seen_types.append(ev.type)
    manager.run_step.side_effect = completing_run_step
    long_task = LongTaskTool(manager=manager)
    long_task.set_hooks({"on_event": record_type})
    await long_task.execute(goal="Test catch-all.")
    for expected in ("task_start", "step_start", "task_complete"):
        assert expected in seen_types
@pytest.mark.asyncio
 async def test_state_exposure():
    """The tool's state properties describe the run once execute() has returned.

    The status is checked before the run ("idle") and after it; nothing is
    inspected mid-run, so execute() is awaited directly rather than being
    scheduled as a background task and slept on.
    """
    mgr = _make_manager_stub()
    async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        for t in extra_tools:
            if t.name == "handoff":
                await t.execute(message="Progress.")
        return _step_result(
            tools_used=["handoff"],
            tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
        )
    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)
    assert tool.status == "idle"
    # Every step hands off, so the run uses up all three steps without completing.
    await tool.execute(goal="Test state.", max_steps=3)
    assert tool.goal == "Test state."
    assert tool.total_steps == 3
    assert tool.status == "error"  # max_steps reached
    assert tool.last_handoff.message == "Progress."
@pytest.mark.asyncio
 async def test_inject_correction():
    """A correction injected before execute() shows up in a step's user message."""
    manager = _make_manager_stub()
    seen_user_messages = []
    async def completing_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        seen_user_messages.append(user_message)
        for candidate in extra_tools:
            if candidate.name != "complete":
                continue
            await candidate.execute(summary="Done.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )
    manager.run_step.side_effect = completing_run_step
    long_task = LongTaskTool(manager=manager)
    long_task.inject_correction("Focus on error handling.")
    await long_task.execute(goal="Refactor code.")
    # At least one captured message must carry the correction text.
    matching = [text for text in seen_user_messages if "Focus on error handling" in text]
    assert matching
 # ---------------------------------------------------------------------------
@ -219,30 +501,48 @@ async def test_long_task_goal_appears_in_system_prompt():
 # ---------------------------------------------------------------------------
-def test_build_user_message_step_0():
+def test_build_system_prompt():
-    from nanobot.agent.tools.long_task import _build_user_message
+    prompt = _build_system_prompt(budget=8)
    assert "handoff()" in prompt
    assert "complete()" in prompt
    assert "8 tool calls" in prompt
-    msg = _build_user_message("Audit all issues.", step=0, handoff="")
+
-    assert msg.startswith("Audit all issues.")
+def test_build_user_message_step_0():
-    assert "Step 1" in msg
+    msg = _build_user_message("Audit all issues.", step=0, max_steps=20, handoff=HandoffState())
    assert "Audit all issues." in msg
    assert "Step 1 of 20" in msg
    assert "8 tool calls" in msg
    assert "Previous Progress" not in msg
 def test_build_user_message_later_step():
-    from nanobot.agent.tools.long_task import _build_user_message
+    handoff = HandoffState(message="Did 1-10.", files_created=["a.py"], next_step_hint="Do Y")
-
+    msg = _build_user_message("Audit all issues.", step=3, max_steps=20, handoff=handoff)
    msg = _build_user_message("Audit all issues.", step=3, handoff="Did 1-10.")
    assert "Audit all issues." in msg
    assert "Previous Progress" in msg
    assert "Did 1-10." in msg
-    assert "Step 4" in msg
+    assert "a.py" in msg
-    assert "8 tool calls" in msg
+    assert "Do Y" in msg
    assert "Step 4 of 20" in msg
 def test_build_user_message_final_step():
    """For step index 18 of 20 the message is marked FINAL and quotes 4 tool calls."""
    text = _build_user_message(
        "Audit all issues.",
        step=18,
        max_steps=20,
        handoff=HandoffState(message="Almost done."),
    )
    # "4 tool calls" is the lower budget used for final steps.
    for expected in ("FINAL Step", "4 tool calls"):
        assert expected in text
 def test_build_user_message_with_correction():
    """The message contains the "User Correction" marker and the correction text."""
    empty_handoff = HandoffState()
    text = _build_user_message(
        "Audit.",
        step=0,
        max_steps=20,
        handoff=empty_handoff,
        correction="Skip file A",
    )
    for expected in ("User Correction", "Skip file A"):
        assert expected in text
 def test_extract_handoff_from_messages():
    from nanobot.agent.tools.long_task import _extract_handoff_from_messages
    messages = [
        {"role": "system", "content": "sys"},
        {"role": "user", "content": "do it"},
@ -254,8 +554,6 @@ def test_extract_handoff_from_messages():
 def test_extract_handoff_skips_budget_message():
    from nanobot.agent.tools.long_task import _extract_handoff_from_messages
    messages = [
        {"role": "system", "content": "sys"},
        {"role": "user", "content": "do it"},
@ -268,12 +566,26 @@ def test_extract_handoff_skips_budget_message():
 def test_extract_handoff_from_empty_messages():
    from nanobot.agent.tools.long_task import _extract_handoff_from_messages
    assert _extract_handoff_from_messages([]) == ""
    assert _extract_handoff_from_messages([{"role": "system", "content": "sys"}]) == ""
 def test_extract_file_changes_from_tool_events():
    """Only the successful write_file and edit_file events contribute a path."""
    def event(name, status, detail):
        return {"name": name, "status": status, "detail": detail}
    tool_events = [
        event("write_file", "ok", "Wrote /workspace/a.py: done"),
        event("edit_file", "ok", "Edited /workspace/b.py: patched"),
        # Neither the read nor the failed write is expected in the output.
        event("read_file", "ok", "Read /workspace/c.py"),
        event("write_file", "error", "Failed"),
    ]
    new_paths, changed_paths = _extract_file_changes(tool_events)
    assert new_paths == ["/workspace/a.py"]
    assert changed_paths == ["/workspace/b.py"]
 def test_extract_file_changes_empty():
    """An empty event list yields a pair of empty lists."""
    outcome = _extract_file_changes([])
    assert outcome == ([], [])
 # ---------------------------------------------------------------------------
 # Integration: verify LongTaskTool is wired into the main agent loop
 # ---------------------------------------------------------------------------