feat(long-task): major overhaul with structured handoffs, validation, and observability

- Structured HandoffState: HandoffTool now accepts files_created,
  files_modified, next_step_hint, and verification fields instead of
  a plain string. Progress is passed between steps as structured data.

- Completion validation round: After complete() is called, a dedicated
  validator step runs to verify the claim against the original goal.
  If validation fails, the task continues rather than returning
  a false completion.

- Dynamic prompt system: 3 Jinja2 templates (step_start, step_middle,
  step_final) selected based on step number. Final steps get tighter
  budget and stronger "wrap up" guidance.

- Automatic file change tracking: Extracts write_file/edit_file events
  from tool_events and injects them into the next step's context if
  the subagent forgot to report them explicitly.

- Budget tracking & adaptive strategy: Cumulative token usage is tracked
  across steps. Per-step tool budget drops from 8 to 4 in the last
  two steps to force handoff/completion.

- Crash retry with graceful degradation: A step that crashes is retried
  once. Persistent crashes terminate the task and return partial progress.

- Full observability hooks for future WebUI integration:
  - set_hooks() with on_step_start, on_step_complete, on_handoff,
    on_validation_started, on_validation_passed, on_validation_failed,
    on_task_complete, on_task_error, and catch-all on_event.
  - Readable state properties: current_step, total_steps, status,
    last_handoff, cumulative_usage, goal.
  - inject_correction() allows external code to send user corrections
    that are injected into the next step's prompt.

- run_step() accepts optional max_iterations for dynamic budget control.

All 27 long-task tests and 11 subagent tests pass.
This commit is contained in:
chengyongru 2026-05-13 00:55:52 +08:00
parent e7214d96ed
commit 78ecb2a99a
7 changed files with 992 additions and 169 deletions

View File

@ -132,6 +132,7 @@ class SubagentManager:
system_prompt: str,
user_message: str,
extra_tools: list["Tool"] | None = None,
max_iterations: int | None = None,
) -> AgentRunResult:
"""Run a single subagent step and return the result directly.
@ -150,7 +151,7 @@ class SubagentManager:
],
tools=tools,
model=self.model,
max_iterations=8,
max_iterations=max_iterations if max_iterations is not None else 8,
max_iterations_message=(
"Tool budget exhausted. "
"Call handoff() or complete() earlier next time."

View File

@ -2,12 +2,20 @@
from __future__ import annotations
import time
from dataclasses import dataclass, field
from typing import Any, TYPE_CHECKING
from loguru import logger
from nanobot.agent.tools.base import Tool, tool_parameters
from nanobot.agent.tools.schema import StringSchema, IntegerSchema, tool_parameters_schema
from nanobot.agent.tools.schema import (
ArraySchema,
IntegerSchema,
StringSchema,
tool_parameters_schema,
)
from nanobot.utils.prompt_templates import render_template
if TYPE_CHECKING:
from nanobot.agent.subagent import SubagentManager
@ -15,7 +23,33 @@ if TYPE_CHECKING:
# ---------------------------------------------------------------------------
# Signal tools -- write progress/completion into a shared dict
# Structured handoff state
# ---------------------------------------------------------------------------
@dataclass
class HandoffState:
"""Structured progress state passed between long-task steps."""
message: str = ""
files_created: list[str] = field(default_factory=list)
files_modified: list[str] = field(default_factory=list)
next_step_hint: str = ""
verification: str = ""
def is_empty(self) -> bool:
return not any(
[
self.message,
self.files_created,
self.files_modified,
self.next_step_hint,
self.verification,
]
)
# ---------------------------------------------------------------------------
# Signal tools -- write progress/completion into a shared state
# ---------------------------------------------------------------------------
@tool_parameters(
@ -24,13 +58,28 @@ if TYPE_CHECKING:
"What you completed in this step and where results are saved. "
"The next step will pick up from here.",
),
files_created=ArraySchema(
StringSchema(""),
description="List of file paths you created in this step",
),
files_modified=ArraySchema(
StringSchema(""),
description="List of file paths you modified in this step",
),
next_step_hint=StringSchema(
"A clear, specific hint about what the next step should do. "
"Be concrete — e.g. 'Implement the test cases in test_foo.py'",
),
verification=StringSchema(
"Any verification you performed (tests run, lint passed, etc.)",
),
required=["message"],
)
)
class HandoffTool(Tool):
"""Signal that the step is done but the overall task continues."""
def __init__(self, store: dict[str, str]) -> None:
def __init__(self, store: HandoffState) -> None:
self._store = store
@property
@ -40,14 +89,25 @@ class HandoffTool(Tool):
@property
def description(self) -> str:
return (
"REQUIRED after finishing your work in this step. "
"Pass your progress summary to the next step. "
"Use complete() instead if the entire goal is achieved."
"You are done with this step. Pass control to the next step. "
"You MUST call this (or complete()) before your tool budget runs out. "
"Provide a detailed summary, list files changed, and hint the next step."
)
async def execute(self, message: str, **kwargs: Any) -> str:
self._store["type"] = "handoff"
self._store["payload"] = message
async def execute(
self,
message: str,
files_created: list[str] | None = None,
files_modified: list[str] | None = None,
next_step_hint: str = "",
verification: str = "",
**kwargs: Any,
) -> str:
self._store.message = message
self._store.files_created = list(files_created or [])
self._store.files_modified = list(files_modified or [])
self._store.next_step_hint = next_step_hint
self._store.verification = verification
return "Progress recorded. The next step will continue from here."
@ -60,7 +120,7 @@ class HandoffTool(Tool):
class CompleteTool(Tool):
"""Signal that the entire long task is finished."""
def __init__(self, store: dict[str, str]) -> None:
def __init__(self, store: HandoffState) -> None:
self._store = store
@property
@ -70,48 +130,94 @@ class CompleteTool(Tool):
@property
def description(self) -> str:
return (
"The ENTIRE goal is achieved. Call this only when nothing remains."
"The ENTIRE goal is achieved. Call this only when nothing remains. "
"Your claim will be validated — if unproven, the task continues."
)
async def execute(self, summary: str, **kwargs: Any) -> str:
self._store["type"] = "complete"
self._store["payload"] = summary
return "Task marked as complete."
self._store.message = summary
return "Task marked as complete. Awaiting validation."
# ---------------------------------------------------------------------------
# System prompt for long-task subagent steps
# Budget and prompt helpers
# ---------------------------------------------------------------------------
_STEP_BUDGET = 8
_FINAL_STEP_BUDGET = 4 # Lower budget for final steps
# Must match max_iterations_message set in SubagentManager.run_step()
_BUDGET_EXHAUSTED_PREFIX = "Tool budget exhausted"
_LONG_TASK_SYSTEM_PROMPT = """\
You are one step in a chain working toward a goal.
1. Check the filesystem to see what's already done.
2. Do the next piece of work. Write results to files as you go \
do NOT just collect information without producing output.
3. When done with your chunk, call handoff() with a brief summary. \
If the entire goal is finished, call complete() instead.
IMPORTANT: Write output to files early and often. If you run out of \
tool calls, only what's on the filesystem survives.
"""
def _step_budget(step: int, max_steps: int) -> int:
"""Compute per-step tool budget based on progress."""
if step >= max_steps - 2:
return _FINAL_STEP_BUDGET
return _STEP_BUDGET
def _build_user_message(goal: str, step: int, handoff: str) -> str:
"""Build the user message for a subagent step with budget warning."""
def _build_system_prompt(budget: int) -> str:
"""Build the system prompt for a subagent step."""
return (
"You are one step in a chain working toward a goal.\n\n"
"Rules:\n"
"1. Do ONE small chunk of work per step.\n"
"2. Write results to files — do NOT just collect information.\n"
"3. Call handoff() when done with your chunk. "
"Call complete() ONLY if the ENTIRE goal is achieved.\n"
f"4. You have {budget} tool calls. "
"Reserve the last 1-2 for handoff() or complete()."
)
def _build_user_message(
goal: str,
step: int,
max_steps: int,
handoff: HandoffState,
correction: str | None = None,
) -> str:
"""Build the user message for a subagent step using templates."""
budget = _step_budget(step, max_steps)
budget_note = (
f"\n\n---\n"
f"Step {step + 1}. You have {_STEP_BUDGET} tool calls total. "
f"Call handoff() or complete() before you run out."
f"Step {step + 1} of {max_steps}. You have {budget} tool calls for this step. "
f"Reserve the last 1-2 calls for handoff() or complete(). "
f"If you run out of calls without calling one, your progress is LOST."
)
if step == 0:
return goal + budget_note
return f"{goal}\n\n## Previous Progress\n{handoff}{budget_note}"
prompt = render_template(
"agent/long_task/step_start.md",
step=step,
max_steps=max_steps,
goal=goal,
budget=budget,
)
elif step >= max_steps - 3:
prompt = render_template(
"agent/long_task/step_final.md",
step=step,
max_steps=max_steps,
goal=goal,
budget=budget,
handoff=handoff,
)
else:
prompt = render_template(
"agent/long_task/step_middle.md",
step=step,
max_steps=max_steps,
goal=goal,
budget=budget,
handoff=handoff,
)
if correction:
prompt += f"\n\n## User Correction\n{correction}\n"
return prompt + budget_note
def _extract_handoff_from_messages(messages: list[dict[str, Any]]) -> str:
@ -132,6 +238,42 @@ def _extract_handoff_from_messages(messages: list[dict[str, Any]]) -> str:
return ""
def _extract_file_changes(
tool_events: list[dict[str, Any]],
) -> tuple[list[str], list[str]]:
"""Extract file creation/modification events from tool events."""
created: list[str] = []
modified: list[str] = []
for event in tool_events:
name = event.get("name", "")
status = event.get("status", "")
detail = event.get("detail", "")
if status != "ok":
continue
if name in ("write_file", "edit_file"):
# Try to extract file path from detail
if detail.startswith("Wrote ") or detail.startswith("Edited "):
path = detail.split(" ", 1)[1].split(":")[0].strip()
if name == "write_file":
created.append(path)
else:
modified.append(path)
return created, modified
# ---------------------------------------------------------------------------
# Observability: events and hooks
# ---------------------------------------------------------------------------
@dataclass
class LongTaskEvent:
"""Event emitted during long-task execution for observability."""
type: str
payload: dict[str, Any] = field(default_factory=dict)
timestamp: float = field(default_factory=time.time)
# ---------------------------------------------------------------------------
# Long Task Tool — the orchestrator
# ---------------------------------------------------------------------------
@ -152,14 +294,32 @@ class LongTaskTool(Tool):
def __init__(self, manager: SubagentManager) -> None:
self._manager = manager
self._hooks: dict[str, Any] = {}
self._reset_state()
@classmethod
def enabled(cls, ctx: ToolContext) -> bool:
return ctx.subagent_manager is not None
def _reset_state(self) -> None:
"""Reset internal state before a new execution.
@classmethod
def create(cls, ctx: ToolContext) -> Tool:
return cls(manager=ctx.subagent_manager)
Preserves any pending user corrections so inject_correction() can be
called before execute() starts.
"""
existing_signals = (
self._state.get("signal_queue", []) if hasattr(self, "_state") else []
)
self._state: dict[str, Any] = {
"current_step": 0,
"total_steps": 0,
"goal": "",
"status": "idle", # idle, running, validating, completed, error
"last_handoff": HandoffState(),
"cumulative_usage": {
"prompt_tokens": 0,
"completion_tokens": 0,
"total_tokens": 0,
},
"signal_queue": existing_signals,
"error": None,
}
@property
def name(self) -> str:
@ -176,61 +336,315 @@ class LongTaskTool(Tool):
"goal. For simple independent tasks, use spawn instead."
)
async def execute(self, goal: str, max_steps: int = 20, **kwargs: Any) -> str:
handoff = ""
logger.debug("long_task start: max_steps={}, goal={:.120}", max_steps, goal)
for step in range(max_steps):
signal_store: dict[str, str] = {}
user_msg = _build_user_message(goal, step, handoff)
@classmethod
def enabled(cls, ctx: ToolContext) -> bool:
return ctx.subagent_manager is not None
@classmethod
def create(cls, ctx: ToolContext) -> Tool:
return cls(manager=ctx.subagent_manager)
# --- State exposure for WebUI observability ---
@property
def current_step(self) -> int:
return self._state["current_step"]
@property
def total_steps(self) -> int:
return self._state["total_steps"]
@property
def status(self) -> str:
return self._state["status"]
@property
def last_handoff(self) -> HandoffState:
return self._state["last_handoff"]
@property
def cumulative_usage(self) -> dict[str, int]:
return dict(self._state["cumulative_usage"])
@property
def goal(self) -> str:
return self._state["goal"]
# --- External signal mechanism (for user correction) ---
def inject_correction(self, message: str) -> None:
"""Inject a user correction message to be read before the next step."""
self._state["signal_queue"].append(message)
logger.info("LongTask correction injected: {}", message[:120])
def _pop_signal(self) -> str | None:
"""Consume and return the oldest pending correction, if any."""
if self._state["signal_queue"]:
return self._state["signal_queue"].pop(0)
return None
# --- Hook system for WebUI and logging ---
def set_hooks(self, hooks: dict[str, Any]) -> None:
"""Register observability hooks.
Supported hooks (all optional):
- on_task_start(goal, max_steps)
- on_step_start(step, goal, budget)
- on_step_complete(step, result, handoff)
- on_handoff(step, handoff)
- on_validation_started(step, completion_summary)
- on_validation_passed(step, summary)
- on_validation_failed(step, reason)
- on_task_complete(step, summary)
- on_task_error(step, error)
- on_event(event: LongTaskEvent) # catch-all
"""
self._hooks = dict(hooks)
def _emit(self, event_type: str, **payload: Any) -> None:
"""Emit an event to registered hooks."""
event = LongTaskEvent(type=event_type, payload=payload)
logger.debug("LongTask event: {} | {}", event_type, payload)
# Call catch-all hook
catch_all = self._hooks.get("on_event")
if catch_all is not None:
try:
result = await self._manager.run_step(
system_prompt=_LONG_TASK_SYSTEM_PROMPT,
catch_all(event)
except Exception:
logger.exception("LongTask on_event hook failed")
# Call specific hook
hook_name = f"on_{event_type}"
hook = self._hooks.get(hook_name)
if hook is not None:
try:
hook(**payload)
except Exception:
logger.exception("LongTask {} hook failed", hook_name)
# --- Core execution ---
async def execute(self, goal: str, max_steps: int = 20, **kwargs: Any) -> str:
handoff = HandoffState()
self._reset_state()
self._state["goal"] = goal
self._state["total_steps"] = max_steps
self._state["status"] = "running"
logger.debug("long_task start: max_steps={}, goal={:.120}", max_steps, goal)
self._emit("task_start", goal=goal, max_steps=max_steps)
for step in range(max_steps):
self._state["current_step"] = step
signal_store = HandoffState()
correction = self._pop_signal()
user_msg = _build_user_message(
goal, step, max_steps, handoff, correction=correction
)
budget = _step_budget(step, max_steps)
self._emit("step_start", step=step, goal=goal, budget=budget)
# Run the step with retry on crash
result = await self._run_step_with_retry(
system_prompt=_build_system_prompt(budget),
user_message=user_msg,
extra_tools=[HandoffTool(signal_store), CompleteTool(signal_store)],
step=step,
budget=budget,
)
except Exception:
logger.exception("long_task step {}/{} failed", step + 1, max_steps)
if handoff:
if result is None:
# Fatal error after retry
self._state["status"] = "error"
self._emit("task_error", step=step, error=self._state["error"])
if handoff.message:
return (
f"Long task failed at step {step + 1}/{max_steps}. "
f"Last progress:\n{handoff}"
f"Last progress:\n{handoff.message}"
)
return f"Long task failed at step {step + 1}/{max_steps}."
sig_type = signal_store.get("type")
sig_payload = signal_store.get("payload", "")
# Accumulate usage
usage = getattr(result, "usage", {}) or {}
for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
self._state["cumulative_usage"][key] += usage.get(key, 0)
# Extract file changes from tool events for automatic tracking
tool_events = getattr(result, "tool_events", []) or []
auto_created, auto_modified = _extract_file_changes(tool_events)
if auto_created or auto_modified:
logger.debug(
"long_task step {}: auto-detected files created={}, modified={}",
step + 1,
auto_created,
auto_modified,
)
self._emit("step_complete", step=step, result=result, handoff=signal_store)
# Determine signal from tool events
sig_type = "none"
for event in tool_events:
ev_name = event.get("name", "")
if ev_name == "complete":
sig_type = "complete"
break
elif ev_name == "handoff":
sig_type = "handoff"
break
# Fallback: if no explicit signal but CompleteTool/HandoffTool was
# called without arguments (message empty), use final_content
if sig_type == "none" and signal_store.message:
# Tool was called but we couldn't detect from events;
# use the store content as handoff
sig_type = "handoff"
elif sig_type == "none":
signal_store.message = _extract_handoff_from_messages(
getattr(result, "messages", []) or []
)
if signal_store.message:
sig_type = "handoff"
sig_payload = signal_store.message
logger.info(
"long_task step {}/{}: signal={}, stop_reason={}, tools={}",
step + 1, max_steps, sig_type or "auto",
step + 1,
max_steps,
sig_type,
result.stop_reason,
result.tools_used,
)
if sig_type == "complete":
logger.debug(
"long_task done at step {}: complete payload={:.200}",
step + 1, sig_payload,
)
return sig_payload
# Auto-extract progress — don't require handoff()
if sig_type == "handoff":
handoff = sig_payload
logger.debug("long_task step {} handoff: {:.200}", step + 1, handoff)
elif result.stop_reason == "completed":
# Subagent returned text naturally (no more tool calls)
handoff = result.final_content or ""
logger.debug(
"long_task step {} natural end: {:.200}",
step + 1, handoff[:200] if handoff else "(empty)",
if sig_type == "complete":
# Validation round
self._state["status"] = "validating"
self._emit(
"validation_started",
step=step,
completion_summary=sig_payload,
)
validated = await self._validate_completion(
goal, sig_payload, max_steps
)
if validated:
self._state["status"] = "completed"
self._emit("task_complete", step=step, summary=sig_payload)
return sig_payload
else:
# max_iterations hit — extract whatever text the subagent produced
handoff = _extract_handoff_from_messages(result.messages)
logger.debug(
"long_task step {} auto-extract: {:.200}",
step + 1, handoff[:200] if handoff else "(empty)",
self._emit(
"validation_failed",
step=step,
reason="Validation did not confirm completion",
)
logger.warning("long_task exhausted max_steps={}", max_steps)
# Fall through to handoff — continue working
handoff = signal_store
handoff.next_step_hint = (
f"Validation failed. Continue working toward the goal. "
f"Previous claim: {sig_payload}"
)
self._state["last_handoff"] = handoff
continue
elif sig_type == "handoff":
self._emit("handoff_received", step=step, handoff=signal_store)
# Merge auto-detected file changes if not explicitly reported
if auto_created and not signal_store.files_created:
signal_store.files_created = auto_created
if auto_modified and not signal_store.files_modified:
signal_store.files_modified = auto_modified
handoff = signal_store
self._state["last_handoff"] = handoff
continue
else:
# No signal — use extracted content as handoff
handoff = HandoffState(message=signal_store.message)
self._state["last_handoff"] = handoff
self._state["status"] = "error"
self._emit("task_error", step=max_steps, error="Max steps reached")
return (
f"Long task reached max steps ({max_steps}). "
f"Last progress:\n{handoff}"
f"Last progress:\n{handoff.message}"
)
async def _run_step_with_retry(
self,
system_prompt: str,
user_message: str,
extra_tools: list[Any],
step: int,
budget: int,
) -> Any:
"""Run a single step with one retry on crash."""
try:
return await self._manager.run_step(
system_prompt=system_prompt,
user_message=user_message,
extra_tools=extra_tools,
max_iterations=budget,
)
except Exception as first_err:
logger.warning(
"long_task step {}/{} crashed (will retry once): {}",
step + 1,
self._state["total_steps"],
first_err,
)
try:
return await self._manager.run_step(
system_prompt=system_prompt,
user_message=user_message,
extra_tools=extra_tools,
max_iterations=budget,
)
except Exception as second_err:
logger.exception(
"long_task step {}/{} failed after retry",
step + 1,
self._state["total_steps"],
)
self._state["error"] = str(second_err)
return None
async def _validate_completion(
self, goal: str, completion_summary: str, max_steps: int
) -> bool:
"""Run a validation step to verify the completion claim."""
try:
validation_store = HandoffState()
validation_prompt = render_template(
"agent/long_task/validation.md",
goal=goal,
completion_summary=completion_summary,
)
result = await self._manager.run_step(
system_prompt=validation_prompt,
user_message="Validate the claimed completion. "
"Call complete() if verified, handoff() if not.",
extra_tools=[
HandoffTool(validation_store),
CompleteTool(validation_store),
],
max_iterations=4, # Short validation step
)
# If complete() was called, validation passed
tool_events = getattr(result, "tool_events", []) or []
for event in tool_events:
if event.get("name") == "complete":
self._emit("validation_passed", summary=completion_summary)
return True
self._emit(
"validation_failed",
reason=validation_store.message or "Validator did not confirm",
)
return False
except Exception:
logger.exception("Validation step failed")
return False

View File

@ -0,0 +1,28 @@
# Long Task — FINAL Step {{ step + 1 }}/{{ max_steps }}
**This is one of the LAST steps. You are running out of budget.**
## Goal
{{ goal }}
## Previous Progress
{% if handoff.message %}
{{ handoff.message }}
{% endif %}
{% if handoff.files_created or handoff.files_modified %}
### Files Changed
{% for f in handoff.files_created %}
- Created: `{{ f }}`
{% endfor %}
{% for f in handoff.files_modified %}
- Modified: `{{ f }}`
{% endfor %}
{% endif %}
## Instructions
1. **Do NOT start new work**. Only finish what is already in progress.
2. **Wrap up**: Complete any partial work, write final results to files.
3. **Final handoff**: Call `handoff()` with a clear summary of what remains unfinished. Call `complete()` ONLY if you are 100% sure everything is done.
You have {{ budget }} tool calls total. Reserve the last 1-2 calls for `handoff()` or `complete()`.

View File

@ -0,0 +1,38 @@
# Long Task — Step {{ step + 1 }}/{{ max_steps }}
You are one step in a chain working toward a goal.
## Goal
{{ goal }}
## Previous Progress
{% if handoff.message %}
{{ handoff.message }}
{% endif %}
{% if handoff.files_created or handoff.files_modified %}
### Files Changed
{% for f in handoff.files_created %}
- Created: `{{ f }}`
{% endfor %}
{% for f in handoff.files_modified %}
- Modified: `{{ f }}`
{% endfor %}
{% endif %}
{% if handoff.next_step_hint %}
### Suggested Next Step
{{ handoff.next_step_hint }}
{% endif %}
{% if handoff.verification %}
### Verification
{{ handoff.verification }}
{% endif %}
## Instructions
1. **Check existing work**: Use the file list above — do NOT re-explore files already handled.
2. **Do the next chunk**: Make concrete progress. Write results to files.
3. **Handoff**: Call `handoff()` with your progress summary, files changed, and a hint for the next step. Call `complete()` only if the ENTIRE goal is achieved.
You have {{ budget }} tool calls total. Reserve the last 1-2 calls for `handoff()` or `complete()`.

View File

@ -0,0 +1,14 @@
# Long Task — Step {{ step + 1 }}/{{ max_steps }}
You are the FIRST step in a chain working toward a goal.
## Goal
{{ goal }}
## Instructions
1. **Explore first**: Check the filesystem to understand the current state. Do NOT assume anything.
2. **Plan your work**: Decide what chunk you will do in this step.
3. **Do the work**: Make concrete progress. Write results to files — do NOT just collect information without producing output.
4. **Handoff**: When done, call `handoff()` with a detailed summary. If the ENTIRE goal is already achieved, call `complete()` instead.
You have {{ budget }} tool calls total. Reserve the last 1-2 calls for `handoff()` or `complete()`.

View File

@ -0,0 +1,16 @@
# Validation — Confirm Goal Completion
## Original Goal
{{ goal }}
## Claimed Completion
{{ completion_summary }}
## Instructions
You are a strict validator. Review the claimed completion against the original goal.
1. Check every requirement in the goal. Is each one actually satisfied by evidence (files created, tests passing, etc.)?
2. If ANY requirement is unproven or incomplete, call `handoff()` with what is missing.
3. If ALL requirements are satisfied with evidence, call `complete()` confirming the validation.
Be skeptical. "Looks correct" is not enough — verify against the filesystem.

View File

@ -2,53 +2,82 @@
import pytest
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock
from nanobot.agent.tools.long_task import (
HandoffState,
HandoffTool,
CompleteTool,
LongTaskTool,
LongTaskEvent,
_build_system_prompt,
_build_user_message,
_extract_file_changes,
_extract_handoff_from_messages,
)
# ---------------------------------------------------------------------------
# Signal tool tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_handoff_tool_stores_signal():
from nanobot.agent.tools.long_task import HandoffTool
store: dict[str, str] = {}
async def test_handoff_tool_stores_structured_signal():
store = HandoffState()
tool = HandoffTool(store)
result = await tool.execute(message="Processed items 1-8. Results in out.md. Continue with item 9.")
result = await tool.execute(
message="Processed items 1-8. Results in out.md.",
files_created=["out.md", "report.md"],
files_modified=["main.py"],
next_step_hint="Continue with item 9.",
verification="Tests passed",
)
assert result == "Progress recorded. The next step will continue from here."
assert store["type"] == "handoff"
assert store["payload"] == "Processed items 1-8. Results in out.md. Continue with item 9."
assert store.message == "Processed items 1-8. Results in out.md."
assert store.files_created == ["out.md", "report.md"]
assert store.files_modified == ["main.py"]
assert store.next_step_hint == "Continue with item 9."
assert store.verification == "Tests passed"
@pytest.mark.asyncio
async def test_handoff_tool_defaults_optional_fields():
store = HandoffState()
tool = HandoffTool(store)
await tool.execute(message="Done.")
assert store.files_created == []
assert store.files_modified == []
assert store.next_step_hint == ""
assert store.verification == ""
@pytest.mark.asyncio
async def test_complete_tool_stores_signal():
from nanobot.agent.tools.long_task import CompleteTool
store: dict[str, str] = {}
store = HandoffState()
tool = CompleteTool(store)
result = await tool.execute(summary="All 100 items processed. Summary in report.md")
assert result == "Task marked as complete."
assert store["type"] == "complete"
assert store["payload"] == "All 100 items processed. Summary in report.md"
assert result == "Task marked as complete. Awaiting validation."
assert store.message == "All 100 items processed. Summary in report.md"
@pytest.mark.asyncio
async def test_signal_tools_overwrite_on_multiple_calls():
"""Last call wins -- the orchestrator only reads the final signal."""
from nanobot.agent.tools.long_task import HandoffTool, CompleteTool
store: dict[str, str] = {}
"""Last call wins — the orchestrator only reads the final signal."""
store = HandoffState()
handoff = HandoffTool(store)
complete = CompleteTool(store)
await handoff.execute(message="first progress")
assert store["type"] == "handoff"
assert store.message == "first progress"
await complete.execute(summary="done early")
assert store["type"] == "complete"
assert store["payload"] == "done early"
assert store.message == "done early"
# ---------------------------------------------------------------------------
# Helper: minimal SubagentManager stub
# ---------------------------------------------------------------------------
def _make_manager_stub():
"""Create a minimal SubagentManager stub with a mockable run_step."""
mgr = MagicMock()
@ -64,6 +93,7 @@ def _step_result(**overrides):
tool_events=[],
stop_reason="completed",
tools_used=[],
usage={},
)
defaults.update(overrides)
return SimpleNamespace(**defaults)
@ -76,68 +106,150 @@ def _step_result(**overrides):
@pytest.mark.asyncio
async def test_long_task_completes_in_one_step():
"""Subagent calls complete() immediately."""
from nanobot.agent.tools.long_task import LongTaskTool
"""Subagent calls complete() immediately, validation passes."""
mgr = _make_manager_stub()
call_count = 0
async def fake_run_step(*, system_prompt, user_message, extra_tools):
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
nonlocal call_count
call_count += 1
if call_count == 1:
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="All done. Report in summary.md")
return _step_result(
final_content="All done.",
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
else:
# Validation round
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Validated")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
result = await tool.execute(goal="Audit all issues.")
assert result == "All done. Report in summary.md"
assert call_count == 2 # main step + validation
@pytest.mark.asyncio
async def test_long_task_completes_after_multiple_handoffs():
"""Subagent calls handoff() twice then complete()."""
from nanobot.agent.tools.long_task import LongTaskTool
"""Subagent calls handoff() twice then complete(), validation passes."""
mgr = _make_manager_stub()
call_count = 0
async def fake_run_step(*, system_prompt, user_message, extra_tools):
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
nonlocal call_count
call_count += 1
if call_count == 1:
for t in extra_tools:
if t.name == "handoff":
await t.execute(message="Processed 1-8.")
return _step_result(
tools_used=["handoff"],
tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
)
elif call_count == 2:
assert "Processed 1-8." in user_message
assert "8 tool calls" in user_message
assert "Step 2" in user_message or "Step 2 of" in user_message
for t in extra_tools:
if t.name == "handoff":
await t.execute(message="Processed 9-16.")
else:
return _step_result(
tools_used=["handoff"],
tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
)
elif call_count == 3:
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="All 16 items audited.")
return _step_result(tools_used=["handoff"])
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
else:
# Validation round
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Validated")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
result = await tool.execute(goal="Audit 16 issues.")
assert result == "All 16 items audited."
assert call_count == 3
assert call_count == 4 # 3 main steps + validation
@pytest.mark.asyncio
async def test_long_task_validation_falls_back_to_handoff():
"""Subagent claims complete but validation fails — task continues."""
mgr = _make_manager_stub()
call_count = 0
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
nonlocal call_count
call_count += 1
if call_count == 1:
# First step claims complete
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Done.")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
elif call_count == 2:
# Validation round fails (handoff called)
for t in extra_tools:
if t.name == "handoff":
await t.execute(message="Not actually done. Need more work.")
return _step_result(
tools_used=["handoff"],
tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
)
elif call_count == 3:
# Continue and complete for real
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Really done.")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
else:
# Second validation passes
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Validated")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
result = await tool.execute(goal="Do something.", max_steps=5)
assert "Really done." == result
assert call_count == 4
@pytest.mark.asyncio
async def test_long_task_fallback_when_no_signal_called():
"""Subagent doesn't call handoff/complete — extract progress from messages."""
from nanobot.agent.tools.long_task import LongTaskTool
mgr = _make_manager_stub()
async def fake_run_step(*, system_prompt, user_message, extra_tools):
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
return _step_result(
final_content="Tool budget exhausted.",
messages=[
@ -161,57 +273,227 @@ async def test_long_task_fallback_when_no_signal_called():
@pytest.mark.asyncio
async def test_long_task_auto_extracts_on_natural_end():
"""Subagent finishes naturally (stop_reason=completed) without calling signal."""
from nanobot.agent.tools.long_task import LongTaskTool
mgr = _make_manager_stub()
steps = 0
call_count = 0
async def fake_run_step(*, system_prompt, user_message, extra_tools):
nonlocal steps
steps += 1
if steps == 1:
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
nonlocal call_count
call_count += 1
if call_count == 1:
return _step_result(
final_content="I processed items 1-5. Results in out.md.",
stop_reason="completed",
)
# Second step: subagent calls complete
elif call_count == 2:
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="All done.")
return _step_result(
final_content="All done.",
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
else:
# Validation
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Validated")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
result = await tool.execute(goal="Process items.", max_steps=5)
assert "All done." == result
assert steps == 2
assert call_count == 3
@pytest.mark.asyncio
async def test_long_task_goal_appears_in_system_prompt():
"""Verify every step's system_prompt contains the long task system prompt."""
from nanobot.agent.tools.long_task import LongTaskTool
async def test_long_task_retries_on_crash():
"""A step that crashes once should be retried."""
mgr = _make_manager_stub()
captured_prompts = []
call_count = 0
async def fake_run_step(*, system_prompt, user_message, extra_tools):
captured_prompts.append(system_prompt)
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
nonlocal call_count
call_count += 1
if call_count == 1:
raise RuntimeError("Simulated crash")
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="done")
return _step_result(final_content="done")
await t.execute(summary="Recovered.")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
await tool.execute(goal="Audit everything.")
assert len(captured_prompts) == 1
assert "handoff()" in captured_prompts[0]
assert "complete()" in captured_prompts[0]
assert "filesystem" in captured_prompts[0]
result = await tool.execute(goal="Test retry.")
assert "Recovered." == result
assert call_count == 3 # main step + retry + validation
@pytest.mark.asyncio
async def test_long_task_fails_after_two_crashes():
"""A step that crashes twice should terminate the task."""
mgr = _make_manager_stub()
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
raise RuntimeError("Persistent crash")
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
result = await tool.execute(goal="Test failure.", max_steps=3)
assert "failed at step 1/3" in result
assert tool.status == "error"
@pytest.mark.asyncio
async def test_long_task_uses_dynamic_budget():
"""Final steps should use lower max_iterations."""
mgr = _make_manager_stub()
captured_budgets = []
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
captured_budgets.append(max_iterations)
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Done.")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
await tool.execute(goal="Test budget.", max_steps=5)
# Step 0-2 should use 8, step 3+ should use 4
# But we complete on step 0, so only one budget captured
assert captured_budgets[0] == 8
# ---------------------------------------------------------------------------
# Hook and observability tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_hooks_receive_events():
"""Registered hooks should be called during execution."""
mgr = _make_manager_stub()
events = []
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Done.")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
tool.set_hooks({
"on_task_start": lambda **kw: events.append(("task_start", kw)),
"on_step_start": lambda **kw: events.append(("step_start", kw)),
"on_step_complete": lambda **kw: events.append(("step_complete", kw)),
"on_validation_started": lambda **kw: events.append(("validation_started", kw)),
"on_task_complete": lambda **kw: events.append(("task_complete", kw)),
})
await tool.execute(goal="Test hooks.")
assert any(e[0] == "task_start" for e in events)
assert any(e[0] == "step_start" for e in events)
assert any(e[0] == "step_complete" for e in events)
assert any(e[0] == "validation_started" for e in events)
assert any(e[0] == "task_complete" for e in events)
@pytest.mark.asyncio
async def test_catch_all_hook_receives_events():
"""The on_event catch-all hook should receive all events."""
mgr = _make_manager_stub()
events = []
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Done.")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
tool.set_hooks({
"on_event": lambda ev: events.append(ev.type),
})
await tool.execute(goal="Test catch-all.")
assert "task_start" in events
assert "step_start" in events
assert "task_complete" in events
@pytest.mark.asyncio
async def test_state_exposure():
"""Properties should reflect current execution state."""
mgr = _make_manager_stub()
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
for t in extra_tools:
if t.name == "handoff":
await t.execute(message="Progress.")
return _step_result(
tools_used=["handoff"],
tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
assert tool.status == "idle"
# Start execution in background so we can inspect mid-run
import asyncio
task = asyncio.create_task(tool.execute(goal="Test state.", max_steps=3))
# Give it a moment to start
await asyncio.sleep(0.01)
# Task should have finished by now since mocks are instant
await task
assert tool.goal == "Test state."
assert tool.total_steps == 3
assert tool.status == "error" # max_steps reached
assert tool.last_handoff.message == "Progress."
@pytest.mark.asyncio
async def test_inject_correction():
"""User correction should appear in the next step's user message."""
mgr = _make_manager_stub()
captured_messages = []
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
captured_messages.append(user_message)
for t in extra_tools:
if t.name == "complete":
await t.execute(summary="Done.")
return _step_result(
tools_used=["complete"],
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
)
mgr.run_step.side_effect = fake_run_step
tool = LongTaskTool(manager=mgr)
tool.inject_correction("Focus on error handling.")
await tool.execute(goal="Refactor code.")
assert any("Focus on error handling" in msg for msg in captured_messages)
# ---------------------------------------------------------------------------
@ -219,30 +501,48 @@ async def test_long_task_goal_appears_in_system_prompt():
# ---------------------------------------------------------------------------
def test_build_user_message_step_0():
from nanobot.agent.tools.long_task import _build_user_message
def test_build_system_prompt():
prompt = _build_system_prompt(budget=8)
assert "handoff()" in prompt
assert "complete()" in prompt
assert "8 tool calls" in prompt
msg = _build_user_message("Audit all issues.", step=0, handoff="")
assert msg.startswith("Audit all issues.")
assert "Step 1" in msg
def test_build_user_message_step_0():
msg = _build_user_message("Audit all issues.", step=0, max_steps=20, handoff=HandoffState())
assert "Audit all issues." in msg
assert "Step 1 of 20" in msg
assert "8 tool calls" in msg
assert "Previous Progress" not in msg
def test_build_user_message_later_step():
from nanobot.agent.tools.long_task import _build_user_message
msg = _build_user_message("Audit all issues.", step=3, handoff="Did 1-10.")
handoff = HandoffState(message="Did 1-10.", files_created=["a.py"], next_step_hint="Do Y")
msg = _build_user_message("Audit all issues.", step=3, max_steps=20, handoff=handoff)
assert "Audit all issues." in msg
assert "Previous Progress" in msg
assert "Did 1-10." in msg
assert "Step 4" in msg
assert "8 tool calls" in msg
assert "a.py" in msg
assert "Do Y" in msg
assert "Step 4 of 20" in msg
def test_build_user_message_final_step():
handoff = HandoffState(message="Almost done.")
msg = _build_user_message("Audit all issues.", step=18, max_steps=20, handoff=handoff)
assert "FINAL Step" in msg
assert "4 tool calls" in msg # final steps use lower budget
def test_build_user_message_with_correction():
msg = _build_user_message(
"Audit.", step=0, max_steps=20, handoff=HandoffState(), correction="Skip file A"
)
assert "User Correction" in msg
assert "Skip file A" in msg
def test_extract_handoff_from_messages():
from nanobot.agent.tools.long_task import _extract_handoff_from_messages
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "do it"},
@ -254,8 +554,6 @@ def test_extract_handoff_from_messages():
def test_extract_handoff_skips_budget_message():
from nanobot.agent.tools.long_task import _extract_handoff_from_messages
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "do it"},
@ -268,12 +566,26 @@ def test_extract_handoff_skips_budget_message():
def test_extract_handoff_from_empty_messages():
from nanobot.agent.tools.long_task import _extract_handoff_from_messages
assert _extract_handoff_from_messages([]) == ""
assert _extract_handoff_from_messages([{"role": "system", "content": "sys"}]) == ""
def test_extract_file_changes_from_tool_events():
events = [
{"name": "write_file", "status": "ok", "detail": "Wrote /workspace/a.py: done"},
{"name": "edit_file", "status": "ok", "detail": "Edited /workspace/b.py: patched"},
{"name": "read_file", "status": "ok", "detail": "Read /workspace/c.py"},
{"name": "write_file", "status": "error", "detail": "Failed"},
]
created, modified = _extract_file_changes(events)
assert created == ["/workspace/a.py"]
assert modified == ["/workspace/b.py"]
def test_extract_file_changes_empty():
assert _extract_file_changes([]) == ([], [])
# ---------------------------------------------------------------------------
# Integration: verify LongTaskTool is wired into the main agent loop
# ---------------------------------------------------------------------------