mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-20 00:22:31 +00:00
feat(long-task): major overhaul with structured handoffs, validation, and observability
- Structured HandoffState: HandoffTool now accepts files_created,
files_modified, next_step_hint, and verification fields instead of
a plain string. Progress is passed between steps as structured data.
- Completion validation round: After complete() is called, a dedicated
validator step runs to verify the claim against the original goal.
If validation fails, the task continues rather than returning
a false completion.
- Dynamic prompt system: 3 Jinja2 templates (step_start, step_middle,
step_final) selected based on step number. Final steps get tighter
budget and stronger "wrap up" guidance.
- Automatic file change tracking: Extracts write_file/edit_file events
from tool_events and injects them into the next step's context if
the subagent forgot to report them explicitly.
- Budget tracking & adaptive strategy: Cumulative token usage is tracked
across steps. Per-step tool budget drops from 8 to 4 in the last
two steps to force handoff/completion.
- Crash retry with graceful degradation: A step that crashes is retried
once. Persistent crashes terminate the task and return partial progress.
- Full observability hooks for future WebUI integration:
- set_hooks() with on_step_start, on_step_complete, on_handoff,
on_validation_started, on_validation_passed, on_validation_failed,
on_task_complete, on_task_error, and catch-all on_event.
- Readable state properties: current_step, total_steps, status,
last_handoff, cumulative_usage, goal.
- inject_correction() allows external code to send user corrections
that are injected into the next step's prompt.
- run_step() accepts optional max_iterations for dynamic budget control.
All 27 long-task tests and 11 subagent tests pass.
This commit is contained in:
parent
e7214d96ed
commit
78ecb2a99a
@ -132,6 +132,7 @@ class SubagentManager:
|
|||||||
system_prompt: str,
|
system_prompt: str,
|
||||||
user_message: str,
|
user_message: str,
|
||||||
extra_tools: list["Tool"] | None = None,
|
extra_tools: list["Tool"] | None = None,
|
||||||
|
max_iterations: int | None = None,
|
||||||
) -> AgentRunResult:
|
) -> AgentRunResult:
|
||||||
"""Run a single subagent step and return the result directly.
|
"""Run a single subagent step and return the result directly.
|
||||||
|
|
||||||
@ -150,7 +151,7 @@ class SubagentManager:
|
|||||||
],
|
],
|
||||||
tools=tools,
|
tools=tools,
|
||||||
model=self.model,
|
model=self.model,
|
||||||
max_iterations=8,
|
max_iterations=max_iterations if max_iterations is not None else 8,
|
||||||
max_iterations_message=(
|
max_iterations_message=(
|
||||||
"Tool budget exhausted. "
|
"Tool budget exhausted. "
|
||||||
"Call handoff() or complete() earlier next time."
|
"Call handoff() or complete() earlier next time."
|
||||||
|
|||||||
@ -2,12 +2,20 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
from typing import Any, TYPE_CHECKING
|
from typing import Any, TYPE_CHECKING
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from nanobot.agent.tools.base import Tool, tool_parameters
|
from nanobot.agent.tools.base import Tool, tool_parameters
|
||||||
from nanobot.agent.tools.schema import StringSchema, IntegerSchema, tool_parameters_schema
|
from nanobot.agent.tools.schema import (
|
||||||
|
ArraySchema,
|
||||||
|
IntegerSchema,
|
||||||
|
StringSchema,
|
||||||
|
tool_parameters_schema,
|
||||||
|
)
|
||||||
|
from nanobot.utils.prompt_templates import render_template
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from nanobot.agent.subagent import SubagentManager
|
from nanobot.agent.subagent import SubagentManager
|
||||||
@ -15,7 +23,33 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Signal tools -- write progress/completion into a shared dict
|
# Structured handoff state
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class HandoffState:
|
||||||
|
"""Structured progress state passed between long-task steps."""
|
||||||
|
|
||||||
|
message: str = ""
|
||||||
|
files_created: list[str] = field(default_factory=list)
|
||||||
|
files_modified: list[str] = field(default_factory=list)
|
||||||
|
next_step_hint: str = ""
|
||||||
|
verification: str = ""
|
||||||
|
|
||||||
|
def is_empty(self) -> bool:
|
||||||
|
return not any(
|
||||||
|
[
|
||||||
|
self.message,
|
||||||
|
self.files_created,
|
||||||
|
self.files_modified,
|
||||||
|
self.next_step_hint,
|
||||||
|
self.verification,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Signal tools -- write progress/completion into a shared state
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
@tool_parameters(
|
@tool_parameters(
|
||||||
@ -24,13 +58,28 @@ if TYPE_CHECKING:
|
|||||||
"What you completed in this step and where results are saved. "
|
"What you completed in this step and where results are saved. "
|
||||||
"The next step will pick up from here.",
|
"The next step will pick up from here.",
|
||||||
),
|
),
|
||||||
|
files_created=ArraySchema(
|
||||||
|
StringSchema(""),
|
||||||
|
description="List of file paths you created in this step",
|
||||||
|
),
|
||||||
|
files_modified=ArraySchema(
|
||||||
|
StringSchema(""),
|
||||||
|
description="List of file paths you modified in this step",
|
||||||
|
),
|
||||||
|
next_step_hint=StringSchema(
|
||||||
|
"A clear, specific hint about what the next step should do. "
|
||||||
|
"Be concrete — e.g. 'Implement the test cases in test_foo.py'",
|
||||||
|
),
|
||||||
|
verification=StringSchema(
|
||||||
|
"Any verification you performed (tests run, lint passed, etc.)",
|
||||||
|
),
|
||||||
required=["message"],
|
required=["message"],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
class HandoffTool(Tool):
|
class HandoffTool(Tool):
|
||||||
"""Signal that the step is done but the overall task continues."""
|
"""Signal that the step is done but the overall task continues."""
|
||||||
|
|
||||||
def __init__(self, store: dict[str, str]) -> None:
|
def __init__(self, store: HandoffState) -> None:
|
||||||
self._store = store
|
self._store = store
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -40,14 +89,25 @@ class HandoffTool(Tool):
|
|||||||
@property
|
@property
|
||||||
def description(self) -> str:
|
def description(self) -> str:
|
||||||
return (
|
return (
|
||||||
"REQUIRED after finishing your work in this step. "
|
"You are done with this step. Pass control to the next step. "
|
||||||
"Pass your progress summary to the next step. "
|
"You MUST call this (or complete()) before your tool budget runs out. "
|
||||||
"Use complete() instead if the entire goal is achieved."
|
"Provide a detailed summary, list files changed, and hint the next step."
|
||||||
)
|
)
|
||||||
|
|
||||||
async def execute(self, message: str, **kwargs: Any) -> str:
|
async def execute(
|
||||||
self._store["type"] = "handoff"
|
self,
|
||||||
self._store["payload"] = message
|
message: str,
|
||||||
|
files_created: list[str] | None = None,
|
||||||
|
files_modified: list[str] | None = None,
|
||||||
|
next_step_hint: str = "",
|
||||||
|
verification: str = "",
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> str:
|
||||||
|
self._store.message = message
|
||||||
|
self._store.files_created = list(files_created or [])
|
||||||
|
self._store.files_modified = list(files_modified or [])
|
||||||
|
self._store.next_step_hint = next_step_hint
|
||||||
|
self._store.verification = verification
|
||||||
return "Progress recorded. The next step will continue from here."
|
return "Progress recorded. The next step will continue from here."
|
||||||
|
|
||||||
|
|
||||||
@ -60,7 +120,7 @@ class HandoffTool(Tool):
|
|||||||
class CompleteTool(Tool):
|
class CompleteTool(Tool):
|
||||||
"""Signal that the entire long task is finished."""
|
"""Signal that the entire long task is finished."""
|
||||||
|
|
||||||
def __init__(self, store: dict[str, str]) -> None:
|
def __init__(self, store: HandoffState) -> None:
|
||||||
self._store = store
|
self._store = store
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -70,48 +130,94 @@ class CompleteTool(Tool):
|
|||||||
@property
|
@property
|
||||||
def description(self) -> str:
|
def description(self) -> str:
|
||||||
return (
|
return (
|
||||||
"The ENTIRE goal is achieved. Call this only when nothing remains."
|
"The ENTIRE goal is achieved. Call this only when nothing remains. "
|
||||||
|
"Your claim will be validated — if unproven, the task continues."
|
||||||
)
|
)
|
||||||
|
|
||||||
async def execute(self, summary: str, **kwargs: Any) -> str:
|
async def execute(self, summary: str, **kwargs: Any) -> str:
|
||||||
self._store["type"] = "complete"
|
self._store.message = summary
|
||||||
self._store["payload"] = summary
|
return "Task marked as complete. Awaiting validation."
|
||||||
return "Task marked as complete."
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# System prompt for long-task subagent steps
|
# Budget and prompt helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
_STEP_BUDGET = 8
|
_STEP_BUDGET = 8
|
||||||
|
_FINAL_STEP_BUDGET = 4 # Lower budget for final steps
|
||||||
|
|
||||||
# Must match max_iterations_message set in SubagentManager.run_step()
|
# Must match max_iterations_message set in SubagentManager.run_step()
|
||||||
_BUDGET_EXHAUSTED_PREFIX = "Tool budget exhausted"
|
_BUDGET_EXHAUSTED_PREFIX = "Tool budget exhausted"
|
||||||
|
|
||||||
_LONG_TASK_SYSTEM_PROMPT = """\
|
|
||||||
You are one step in a chain working toward a goal.
|
|
||||||
|
|
||||||
1. Check the filesystem to see what's already done.
|
def _step_budget(step: int, max_steps: int) -> int:
|
||||||
2. Do the next piece of work. Write results to files as you go — \
|
"""Compute per-step tool budget based on progress."""
|
||||||
do NOT just collect information without producing output.
|
if step >= max_steps - 2:
|
||||||
3. When done with your chunk, call handoff() with a brief summary. \
|
return _FINAL_STEP_BUDGET
|
||||||
If the entire goal is finished, call complete() instead.
|
return _STEP_BUDGET
|
||||||
|
|
||||||
IMPORTANT: Write output to files early and often. If you run out of \
|
|
||||||
tool calls, only what's on the filesystem survives.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def _build_user_message(goal: str, step: int, handoff: str) -> str:
|
def _build_system_prompt(budget: int) -> str:
|
||||||
"""Build the user message for a subagent step with budget warning."""
|
"""Build the system prompt for a subagent step."""
|
||||||
|
return (
|
||||||
|
"You are one step in a chain working toward a goal.\n\n"
|
||||||
|
"Rules:\n"
|
||||||
|
"1. Do ONE small chunk of work per step.\n"
|
||||||
|
"2. Write results to files — do NOT just collect information.\n"
|
||||||
|
"3. Call handoff() when done with your chunk. "
|
||||||
|
"Call complete() ONLY if the ENTIRE goal is achieved.\n"
|
||||||
|
f"4. You have {budget} tool calls. "
|
||||||
|
"Reserve the last 1-2 for handoff() or complete()."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_user_message(
|
||||||
|
goal: str,
|
||||||
|
step: int,
|
||||||
|
max_steps: int,
|
||||||
|
handoff: HandoffState,
|
||||||
|
correction: str | None = None,
|
||||||
|
) -> str:
|
||||||
|
"""Build the user message for a subagent step using templates."""
|
||||||
|
budget = _step_budget(step, max_steps)
|
||||||
budget_note = (
|
budget_note = (
|
||||||
f"\n\n---\n"
|
f"\n\n---\n"
|
||||||
f"Step {step + 1}. You have {_STEP_BUDGET} tool calls total. "
|
f"Step {step + 1} of {max_steps}. You have {budget} tool calls for this step. "
|
||||||
f"Call handoff() or complete() before you run out."
|
f"Reserve the last 1-2 calls for handoff() or complete(). "
|
||||||
|
f"If you run out of calls without calling one, your progress is LOST."
|
||||||
)
|
)
|
||||||
|
|
||||||
if step == 0:
|
if step == 0:
|
||||||
return goal + budget_note
|
prompt = render_template(
|
||||||
return f"{goal}\n\n## Previous Progress\n{handoff}{budget_note}"
|
"agent/long_task/step_start.md",
|
||||||
|
step=step,
|
||||||
|
max_steps=max_steps,
|
||||||
|
goal=goal,
|
||||||
|
budget=budget,
|
||||||
|
)
|
||||||
|
elif step >= max_steps - 3:
|
||||||
|
prompt = render_template(
|
||||||
|
"agent/long_task/step_final.md",
|
||||||
|
step=step,
|
||||||
|
max_steps=max_steps,
|
||||||
|
goal=goal,
|
||||||
|
budget=budget,
|
||||||
|
handoff=handoff,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
prompt = render_template(
|
||||||
|
"agent/long_task/step_middle.md",
|
||||||
|
step=step,
|
||||||
|
max_steps=max_steps,
|
||||||
|
goal=goal,
|
||||||
|
budget=budget,
|
||||||
|
handoff=handoff,
|
||||||
|
)
|
||||||
|
|
||||||
|
if correction:
|
||||||
|
prompt += f"\n\n## User Correction\n{correction}\n"
|
||||||
|
|
||||||
|
return prompt + budget_note
|
||||||
|
|
||||||
|
|
||||||
def _extract_handoff_from_messages(messages: list[dict[str, Any]]) -> str:
|
def _extract_handoff_from_messages(messages: list[dict[str, Any]]) -> str:
|
||||||
@ -132,6 +238,42 @@ def _extract_handoff_from_messages(messages: list[dict[str, Any]]) -> str:
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_file_changes(
|
||||||
|
tool_events: list[dict[str, Any]],
|
||||||
|
) -> tuple[list[str], list[str]]:
|
||||||
|
"""Extract file creation/modification events from tool events."""
|
||||||
|
created: list[str] = []
|
||||||
|
modified: list[str] = []
|
||||||
|
for event in tool_events:
|
||||||
|
name = event.get("name", "")
|
||||||
|
status = event.get("status", "")
|
||||||
|
detail = event.get("detail", "")
|
||||||
|
if status != "ok":
|
||||||
|
continue
|
||||||
|
if name in ("write_file", "edit_file"):
|
||||||
|
# Try to extract file path from detail
|
||||||
|
if detail.startswith("Wrote ") or detail.startswith("Edited "):
|
||||||
|
path = detail.split(" ", 1)[1].split(":")[0].strip()
|
||||||
|
if name == "write_file":
|
||||||
|
created.append(path)
|
||||||
|
else:
|
||||||
|
modified.append(path)
|
||||||
|
return created, modified
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Observability: events and hooks
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class LongTaskEvent:
|
||||||
|
"""Event emitted during long-task execution for observability."""
|
||||||
|
|
||||||
|
type: str
|
||||||
|
payload: dict[str, Any] = field(default_factory=dict)
|
||||||
|
timestamp: float = field(default_factory=time.time)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Long Task Tool — the orchestrator
|
# Long Task Tool — the orchestrator
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@ -152,14 +294,32 @@ class LongTaskTool(Tool):
|
|||||||
|
|
||||||
def __init__(self, manager: SubagentManager) -> None:
|
def __init__(self, manager: SubagentManager) -> None:
|
||||||
self._manager = manager
|
self._manager = manager
|
||||||
|
self._hooks: dict[str, Any] = {}
|
||||||
|
self._reset_state()
|
||||||
|
|
||||||
@classmethod
|
def _reset_state(self) -> None:
|
||||||
def enabled(cls, ctx: ToolContext) -> bool:
|
"""Reset internal state before a new execution.
|
||||||
return ctx.subagent_manager is not None
|
|
||||||
|
|
||||||
@classmethod
|
Preserves any pending user corrections so inject_correction() can be
|
||||||
def create(cls, ctx: ToolContext) -> Tool:
|
called before execute() starts.
|
||||||
return cls(manager=ctx.subagent_manager)
|
"""
|
||||||
|
existing_signals = (
|
||||||
|
self._state.get("signal_queue", []) if hasattr(self, "_state") else []
|
||||||
|
)
|
||||||
|
self._state: dict[str, Any] = {
|
||||||
|
"current_step": 0,
|
||||||
|
"total_steps": 0,
|
||||||
|
"goal": "",
|
||||||
|
"status": "idle", # idle, running, validating, completed, error
|
||||||
|
"last_handoff": HandoffState(),
|
||||||
|
"cumulative_usage": {
|
||||||
|
"prompt_tokens": 0,
|
||||||
|
"completion_tokens": 0,
|
||||||
|
"total_tokens": 0,
|
||||||
|
},
|
||||||
|
"signal_queue": existing_signals,
|
||||||
|
"error": None,
|
||||||
|
}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def name(self) -> str:
|
def name(self) -> str:
|
||||||
@ -176,61 +336,315 @@ class LongTaskTool(Tool):
|
|||||||
"goal. For simple independent tasks, use spawn instead."
|
"goal. For simple independent tasks, use spawn instead."
|
||||||
)
|
)
|
||||||
|
|
||||||
async def execute(self, goal: str, max_steps: int = 20, **kwargs: Any) -> str:
|
@classmethod
|
||||||
handoff = ""
|
def enabled(cls, ctx: ToolContext) -> bool:
|
||||||
logger.debug("long_task start: max_steps={}, goal={:.120}", max_steps, goal)
|
return ctx.subagent_manager is not None
|
||||||
for step in range(max_steps):
|
|
||||||
signal_store: dict[str, str] = {}
|
@classmethod
|
||||||
user_msg = _build_user_message(goal, step, handoff)
|
def create(cls, ctx: ToolContext) -> Tool:
|
||||||
|
return cls(manager=ctx.subagent_manager)
|
||||||
|
|
||||||
|
# --- State exposure for WebUI observability ---
|
||||||
|
|
||||||
|
@property
|
||||||
|
def current_step(self) -> int:
|
||||||
|
return self._state["current_step"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def total_steps(self) -> int:
|
||||||
|
return self._state["total_steps"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def status(self) -> str:
|
||||||
|
return self._state["status"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def last_handoff(self) -> HandoffState:
|
||||||
|
return self._state["last_handoff"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def cumulative_usage(self) -> dict[str, int]:
|
||||||
|
return dict(self._state["cumulative_usage"])
|
||||||
|
|
||||||
|
@property
|
||||||
|
def goal(self) -> str:
|
||||||
|
return self._state["goal"]
|
||||||
|
|
||||||
|
# --- External signal mechanism (for user correction) ---
|
||||||
|
|
||||||
|
def inject_correction(self, message: str) -> None:
|
||||||
|
"""Inject a user correction message to be read before the next step."""
|
||||||
|
self._state["signal_queue"].append(message)
|
||||||
|
logger.info("LongTask correction injected: {}", message[:120])
|
||||||
|
|
||||||
|
def _pop_signal(self) -> str | None:
|
||||||
|
"""Consume and return the oldest pending correction, if any."""
|
||||||
|
if self._state["signal_queue"]:
|
||||||
|
return self._state["signal_queue"].pop(0)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# --- Hook system for WebUI and logging ---
|
||||||
|
|
||||||
|
def set_hooks(self, hooks: dict[str, Any]) -> None:
|
||||||
|
"""Register observability hooks.
|
||||||
|
|
||||||
|
Supported hooks (all optional):
|
||||||
|
- on_task_start(goal, max_steps)
|
||||||
|
- on_step_start(step, goal, budget)
|
||||||
|
- on_step_complete(step, result, handoff)
|
||||||
|
- on_handoff(step, handoff)
|
||||||
|
- on_validation_started(step, completion_summary)
|
||||||
|
- on_validation_passed(step, summary)
|
||||||
|
- on_validation_failed(step, reason)
|
||||||
|
- on_task_complete(step, summary)
|
||||||
|
- on_task_error(step, error)
|
||||||
|
- on_event(event: LongTaskEvent) # catch-all
|
||||||
|
"""
|
||||||
|
self._hooks = dict(hooks)
|
||||||
|
|
||||||
|
def _emit(self, event_type: str, **payload: Any) -> None:
|
||||||
|
"""Emit an event to registered hooks."""
|
||||||
|
event = LongTaskEvent(type=event_type, payload=payload)
|
||||||
|
logger.debug("LongTask event: {} | {}", event_type, payload)
|
||||||
|
|
||||||
|
# Call catch-all hook
|
||||||
|
catch_all = self._hooks.get("on_event")
|
||||||
|
if catch_all is not None:
|
||||||
try:
|
try:
|
||||||
result = await self._manager.run_step(
|
catch_all(event)
|
||||||
system_prompt=_LONG_TASK_SYSTEM_PROMPT,
|
|
||||||
user_message=user_msg,
|
|
||||||
extra_tools=[HandoffTool(signal_store), CompleteTool(signal_store)],
|
|
||||||
)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("long_task step {}/{} failed", step + 1, max_steps)
|
logger.exception("LongTask on_event hook failed")
|
||||||
if handoff:
|
|
||||||
|
# Call specific hook
|
||||||
|
hook_name = f"on_{event_type}"
|
||||||
|
hook = self._hooks.get(hook_name)
|
||||||
|
if hook is not None:
|
||||||
|
try:
|
||||||
|
hook(**payload)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("LongTask {} hook failed", hook_name)
|
||||||
|
|
||||||
|
# --- Core execution ---
|
||||||
|
|
||||||
|
async def execute(self, goal: str, max_steps: int = 20, **kwargs: Any) -> str:
|
||||||
|
handoff = HandoffState()
|
||||||
|
self._reset_state()
|
||||||
|
self._state["goal"] = goal
|
||||||
|
self._state["total_steps"] = max_steps
|
||||||
|
self._state["status"] = "running"
|
||||||
|
|
||||||
|
logger.debug("long_task start: max_steps={}, goal={:.120}", max_steps, goal)
|
||||||
|
self._emit("task_start", goal=goal, max_steps=max_steps)
|
||||||
|
|
||||||
|
for step in range(max_steps):
|
||||||
|
self._state["current_step"] = step
|
||||||
|
signal_store = HandoffState()
|
||||||
|
correction = self._pop_signal()
|
||||||
|
user_msg = _build_user_message(
|
||||||
|
goal, step, max_steps, handoff, correction=correction
|
||||||
|
)
|
||||||
|
|
||||||
|
budget = _step_budget(step, max_steps)
|
||||||
|
self._emit("step_start", step=step, goal=goal, budget=budget)
|
||||||
|
|
||||||
|
# Run the step with retry on crash
|
||||||
|
result = await self._run_step_with_retry(
|
||||||
|
system_prompt=_build_system_prompt(budget),
|
||||||
|
user_message=user_msg,
|
||||||
|
extra_tools=[HandoffTool(signal_store), CompleteTool(signal_store)],
|
||||||
|
step=step,
|
||||||
|
budget=budget,
|
||||||
|
)
|
||||||
|
|
||||||
|
if result is None:
|
||||||
|
# Fatal error after retry
|
||||||
|
self._state["status"] = "error"
|
||||||
|
self._emit("task_error", step=step, error=self._state["error"])
|
||||||
|
if handoff.message:
|
||||||
return (
|
return (
|
||||||
f"Long task failed at step {step + 1}/{max_steps}. "
|
f"Long task failed at step {step + 1}/{max_steps}. "
|
||||||
f"Last progress:\n{handoff}"
|
f"Last progress:\n{handoff.message}"
|
||||||
)
|
)
|
||||||
return f"Long task failed at step {step + 1}/{max_steps}."
|
return f"Long task failed at step {step + 1}/{max_steps}."
|
||||||
sig_type = signal_store.get("type")
|
|
||||||
sig_payload = signal_store.get("payload", "")
|
# Accumulate usage
|
||||||
|
usage = getattr(result, "usage", {}) or {}
|
||||||
|
for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
|
||||||
|
self._state["cumulative_usage"][key] += usage.get(key, 0)
|
||||||
|
|
||||||
|
# Extract file changes from tool events for automatic tracking
|
||||||
|
tool_events = getattr(result, "tool_events", []) or []
|
||||||
|
auto_created, auto_modified = _extract_file_changes(tool_events)
|
||||||
|
if auto_created or auto_modified:
|
||||||
|
logger.debug(
|
||||||
|
"long_task step {}: auto-detected files created={}, modified={}",
|
||||||
|
step + 1,
|
||||||
|
auto_created,
|
||||||
|
auto_modified,
|
||||||
|
)
|
||||||
|
|
||||||
|
self._emit("step_complete", step=step, result=result, handoff=signal_store)
|
||||||
|
|
||||||
|
# Determine signal from tool events
|
||||||
|
sig_type = "none"
|
||||||
|
for event in tool_events:
|
||||||
|
ev_name = event.get("name", "")
|
||||||
|
if ev_name == "complete":
|
||||||
|
sig_type = "complete"
|
||||||
|
break
|
||||||
|
elif ev_name == "handoff":
|
||||||
|
sig_type = "handoff"
|
||||||
|
break
|
||||||
|
|
||||||
|
# Fallback: if no explicit signal but CompleteTool/HandoffTool was
|
||||||
|
# called without arguments (message empty), use final_content
|
||||||
|
if sig_type == "none" and signal_store.message:
|
||||||
|
# Tool was called but we couldn't detect from events;
|
||||||
|
# use the store content as handoff
|
||||||
|
sig_type = "handoff"
|
||||||
|
elif sig_type == "none":
|
||||||
|
signal_store.message = _extract_handoff_from_messages(
|
||||||
|
getattr(result, "messages", []) or []
|
||||||
|
)
|
||||||
|
if signal_store.message:
|
||||||
|
sig_type = "handoff"
|
||||||
|
|
||||||
|
sig_payload = signal_store.message
|
||||||
logger.info(
|
logger.info(
|
||||||
"long_task step {}/{}: signal={}, stop_reason={}, tools={}",
|
"long_task step {}/{}: signal={}, stop_reason={}, tools={}",
|
||||||
step + 1, max_steps, sig_type or "auto",
|
step + 1,
|
||||||
|
max_steps,
|
||||||
|
sig_type,
|
||||||
result.stop_reason,
|
result.stop_reason,
|
||||||
result.tools_used,
|
result.tools_used,
|
||||||
)
|
)
|
||||||
if sig_type == "complete":
|
|
||||||
logger.debug(
|
|
||||||
"long_task done at step {}: complete payload={:.200}",
|
|
||||||
step + 1, sig_payload,
|
|
||||||
)
|
|
||||||
return sig_payload
|
|
||||||
|
|
||||||
# Auto-extract progress — don't require handoff()
|
if sig_type == "complete":
|
||||||
if sig_type == "handoff":
|
# Validation round
|
||||||
handoff = sig_payload
|
self._state["status"] = "validating"
|
||||||
logger.debug("long_task step {} handoff: {:.200}", step + 1, handoff)
|
self._emit(
|
||||||
elif result.stop_reason == "completed":
|
"validation_started",
|
||||||
# Subagent returned text naturally (no more tool calls)
|
step=step,
|
||||||
handoff = result.final_content or ""
|
completion_summary=sig_payload,
|
||||||
logger.debug(
|
|
||||||
"long_task step {} natural end: {:.200}",
|
|
||||||
step + 1, handoff[:200] if handoff else "(empty)",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
validated = await self._validate_completion(
|
||||||
|
goal, sig_payload, max_steps
|
||||||
|
)
|
||||||
|
if validated:
|
||||||
|
self._state["status"] = "completed"
|
||||||
|
self._emit("task_complete", step=step, summary=sig_payload)
|
||||||
|
return sig_payload
|
||||||
|
else:
|
||||||
|
self._emit(
|
||||||
|
"validation_failed",
|
||||||
|
step=step,
|
||||||
|
reason="Validation did not confirm completion",
|
||||||
|
)
|
||||||
|
# Fall through to handoff — continue working
|
||||||
|
handoff = signal_store
|
||||||
|
handoff.next_step_hint = (
|
||||||
|
f"Validation failed. Continue working toward the goal. "
|
||||||
|
f"Previous claim: {sig_payload}"
|
||||||
|
)
|
||||||
|
self._state["last_handoff"] = handoff
|
||||||
|
continue
|
||||||
|
|
||||||
|
elif sig_type == "handoff":
|
||||||
|
self._emit("handoff_received", step=step, handoff=signal_store)
|
||||||
|
# Merge auto-detected file changes if not explicitly reported
|
||||||
|
if auto_created and not signal_store.files_created:
|
||||||
|
signal_store.files_created = auto_created
|
||||||
|
if auto_modified and not signal_store.files_modified:
|
||||||
|
signal_store.files_modified = auto_modified
|
||||||
|
handoff = signal_store
|
||||||
|
self._state["last_handoff"] = handoff
|
||||||
|
continue
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# max_iterations hit — extract whatever text the subagent produced
|
# No signal — use extracted content as handoff
|
||||||
handoff = _extract_handoff_from_messages(result.messages)
|
handoff = HandoffState(message=signal_store.message)
|
||||||
logger.debug(
|
self._state["last_handoff"] = handoff
|
||||||
"long_task step {} auto-extract: {:.200}",
|
|
||||||
step + 1, handoff[:200] if handoff else "(empty)",
|
self._state["status"] = "error"
|
||||||
)
|
self._emit("task_error", step=max_steps, error="Max steps reached")
|
||||||
logger.warning("long_task exhausted max_steps={}", max_steps)
|
|
||||||
return (
|
return (
|
||||||
f"Long task reached max steps ({max_steps}). "
|
f"Long task reached max steps ({max_steps}). "
|
||||||
f"Last progress:\n{handoff}"
|
f"Last progress:\n{handoff.message}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def _run_step_with_retry(
|
||||||
|
self,
|
||||||
|
system_prompt: str,
|
||||||
|
user_message: str,
|
||||||
|
extra_tools: list[Any],
|
||||||
|
step: int,
|
||||||
|
budget: int,
|
||||||
|
) -> Any:
|
||||||
|
"""Run a single step with one retry on crash."""
|
||||||
|
try:
|
||||||
|
return await self._manager.run_step(
|
||||||
|
system_prompt=system_prompt,
|
||||||
|
user_message=user_message,
|
||||||
|
extra_tools=extra_tools,
|
||||||
|
max_iterations=budget,
|
||||||
|
)
|
||||||
|
except Exception as first_err:
|
||||||
|
logger.warning(
|
||||||
|
"long_task step {}/{} crashed (will retry once): {}",
|
||||||
|
step + 1,
|
||||||
|
self._state["total_steps"],
|
||||||
|
first_err,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
return await self._manager.run_step(
|
||||||
|
system_prompt=system_prompt,
|
||||||
|
user_message=user_message,
|
||||||
|
extra_tools=extra_tools,
|
||||||
|
max_iterations=budget,
|
||||||
|
)
|
||||||
|
except Exception as second_err:
|
||||||
|
logger.exception(
|
||||||
|
"long_task step {}/{} failed after retry",
|
||||||
|
step + 1,
|
||||||
|
self._state["total_steps"],
|
||||||
|
)
|
||||||
|
self._state["error"] = str(second_err)
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _validate_completion(
|
||||||
|
self, goal: str, completion_summary: str, max_steps: int
|
||||||
|
) -> bool:
|
||||||
|
"""Run a validation step to verify the completion claim."""
|
||||||
|
try:
|
||||||
|
validation_store = HandoffState()
|
||||||
|
validation_prompt = render_template(
|
||||||
|
"agent/long_task/validation.md",
|
||||||
|
goal=goal,
|
||||||
|
completion_summary=completion_summary,
|
||||||
|
)
|
||||||
|
result = await self._manager.run_step(
|
||||||
|
system_prompt=validation_prompt,
|
||||||
|
user_message="Validate the claimed completion. "
|
||||||
|
"Call complete() if verified, handoff() if not.",
|
||||||
|
extra_tools=[
|
||||||
|
HandoffTool(validation_store),
|
||||||
|
CompleteTool(validation_store),
|
||||||
|
],
|
||||||
|
max_iterations=4, # Short validation step
|
||||||
|
)
|
||||||
|
# If complete() was called, validation passed
|
||||||
|
tool_events = getattr(result, "tool_events", []) or []
|
||||||
|
for event in tool_events:
|
||||||
|
if event.get("name") == "complete":
|
||||||
|
self._emit("validation_passed", summary=completion_summary)
|
||||||
|
return True
|
||||||
|
|
||||||
|
self._emit(
|
||||||
|
"validation_failed",
|
||||||
|
reason=validation_store.message or "Validator did not confirm",
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
except Exception:
|
||||||
|
logger.exception("Validation step failed")
|
||||||
|
return False
|
||||||
|
|||||||
28
nanobot/templates/agent/long_task/step_final.md
Normal file
28
nanobot/templates/agent/long_task/step_final.md
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
# Long Task — FINAL Step {{ step + 1 }}/{{ max_steps }}
|
||||||
|
|
||||||
|
**This is one of the LAST steps. You are running out of budget.**
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
{{ goal }}
|
||||||
|
|
||||||
|
## Previous Progress
|
||||||
|
{% if handoff.message %}
|
||||||
|
{{ handoff.message }}
|
||||||
|
{% endif %}
|
||||||
|
{% if handoff.files_created or handoff.files_modified %}
|
||||||
|
|
||||||
|
### Files Changed
|
||||||
|
{% for f in handoff.files_created %}
|
||||||
|
- Created: `{{ f }}`
|
||||||
|
{% endfor %}
|
||||||
|
{% for f in handoff.files_modified %}
|
||||||
|
- Modified: `{{ f }}`
|
||||||
|
{% endfor %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
## Instructions
|
||||||
|
1. **Do NOT start new work**. Only finish what is already in progress.
|
||||||
|
2. **Wrap up**: Complete any partial work, write final results to files.
|
||||||
|
3. **Final handoff**: Call `handoff()` with a clear summary of what remains unfinished. Call `complete()` ONLY if you are 100% sure everything is done.
|
||||||
|
|
||||||
|
You have {{ budget }} tool calls total. Reserve the last 1-2 calls for `handoff()` or `complete()`.
|
||||||
38
nanobot/templates/agent/long_task/step_middle.md
Normal file
38
nanobot/templates/agent/long_task/step_middle.md
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
# Long Task — Step {{ step + 1 }}/{{ max_steps }}
|
||||||
|
|
||||||
|
You are one step in a chain working toward a goal.
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
{{ goal }}
|
||||||
|
|
||||||
|
## Previous Progress
|
||||||
|
{% if handoff.message %}
|
||||||
|
{{ handoff.message }}
|
||||||
|
{% endif %}
|
||||||
|
{% if handoff.files_created or handoff.files_modified %}
|
||||||
|
|
||||||
|
### Files Changed
|
||||||
|
{% for f in handoff.files_created %}
|
||||||
|
- Created: `{{ f }}`
|
||||||
|
{% endfor %}
|
||||||
|
{% for f in handoff.files_modified %}
|
||||||
|
- Modified: `{{ f }}`
|
||||||
|
{% endfor %}
|
||||||
|
{% endif %}
|
||||||
|
{% if handoff.next_step_hint %}
|
||||||
|
|
||||||
|
### Suggested Next Step
|
||||||
|
{{ handoff.next_step_hint }}
|
||||||
|
{% endif %}
|
||||||
|
{% if handoff.verification %}
|
||||||
|
|
||||||
|
### Verification
|
||||||
|
{{ handoff.verification }}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
## Instructions
|
||||||
|
1. **Check existing work**: Use the file list above — do NOT re-explore files already handled.
|
||||||
|
2. **Do the next chunk**: Make concrete progress. Write results to files.
|
||||||
|
3. **Handoff**: Call `handoff()` with your progress summary, files changed, and a hint for the next step. Call `complete()` only if the ENTIRE goal is achieved.
|
||||||
|
|
||||||
|
You have {{ budget }} tool calls total. Reserve the last 1-2 calls for `handoff()` or `complete()`.
|
||||||
14
nanobot/templates/agent/long_task/step_start.md
Normal file
14
nanobot/templates/agent/long_task/step_start.md
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Long Task — Step {{ step + 1 }}/{{ max_steps }}
|
||||||
|
|
||||||
|
You are the FIRST step in a chain working toward a goal.
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
{{ goal }}
|
||||||
|
|
||||||
|
## Instructions
|
||||||
|
1. **Explore first**: Check the filesystem to understand the current state. Do NOT assume anything.
|
||||||
|
2. **Plan your work**: Decide what chunk you will do in this step.
|
||||||
|
3. **Do the work**: Make concrete progress. Write results to files — do NOT just collect information without producing output.
|
||||||
|
4. **Handoff**: When done, call `handoff()` with a detailed summary. If the ENTIRE goal is already achieved, call `complete()` instead.
|
||||||
|
|
||||||
|
You have {{ budget }} tool calls total. Reserve the last 1-2 calls for `handoff()` or `complete()`.
|
||||||
16
nanobot/templates/agent/long_task/validation.md
Normal file
16
nanobot/templates/agent/long_task/validation.md
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# Validation — Confirm Goal Completion
|
||||||
|
|
||||||
|
## Original Goal
|
||||||
|
{{ goal }}
|
||||||
|
|
||||||
|
## Claimed Completion
|
||||||
|
{{ completion_summary }}
|
||||||
|
|
||||||
|
## Instructions
|
||||||
|
You are a strict validator. Review the claimed completion against the original goal.
|
||||||
|
|
||||||
|
1. Check every requirement in the goal. Is each one actually satisfied by evidence (files created, tests passing, etc.)?
|
||||||
|
2. If ANY requirement is unproven or incomplete, call `handoff()` with what is missing.
|
||||||
|
3. If ALL requirements are satisfied with evidence, call `complete()` confirming the validation.
|
||||||
|
|
||||||
|
Be skeptical. "Looks correct" is not enough — verify against the filesystem.
|
||||||
@ -2,53 +2,82 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
|
|
||||||
from unittest.mock import AsyncMock, MagicMock
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
from nanobot.agent.tools.long_task import (
|
||||||
|
HandoffState,
|
||||||
|
HandoffTool,
|
||||||
|
CompleteTool,
|
||||||
|
LongTaskTool,
|
||||||
|
LongTaskEvent,
|
||||||
|
_build_system_prompt,
|
||||||
|
_build_user_message,
|
||||||
|
_extract_file_changes,
|
||||||
|
_extract_handoff_from_messages,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Signal tool tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_handoff_tool_stores_signal():
|
async def test_handoff_tool_stores_structured_signal():
|
||||||
from nanobot.agent.tools.long_task import HandoffTool
|
store = HandoffState()
|
||||||
|
|
||||||
store: dict[str, str] = {}
|
|
||||||
tool = HandoffTool(store)
|
tool = HandoffTool(store)
|
||||||
result = await tool.execute(message="Processed items 1-8. Results in out.md. Continue with item 9.")
|
result = await tool.execute(
|
||||||
|
message="Processed items 1-8. Results in out.md.",
|
||||||
|
files_created=["out.md", "report.md"],
|
||||||
|
files_modified=["main.py"],
|
||||||
|
next_step_hint="Continue with item 9.",
|
||||||
|
verification="Tests passed",
|
||||||
|
)
|
||||||
assert result == "Progress recorded. The next step will continue from here."
|
assert result == "Progress recorded. The next step will continue from here."
|
||||||
assert store["type"] == "handoff"
|
assert store.message == "Processed items 1-8. Results in out.md."
|
||||||
assert store["payload"] == "Processed items 1-8. Results in out.md. Continue with item 9."
|
assert store.files_created == ["out.md", "report.md"]
|
||||||
|
assert store.files_modified == ["main.py"]
|
||||||
|
assert store.next_step_hint == "Continue with item 9."
|
||||||
|
assert store.verification == "Tests passed"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_handoff_tool_defaults_optional_fields():
|
||||||
|
store = HandoffState()
|
||||||
|
tool = HandoffTool(store)
|
||||||
|
await tool.execute(message="Done.")
|
||||||
|
assert store.files_created == []
|
||||||
|
assert store.files_modified == []
|
||||||
|
assert store.next_step_hint == ""
|
||||||
|
assert store.verification == ""
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_complete_tool_stores_signal():
|
async def test_complete_tool_stores_signal():
|
||||||
from nanobot.agent.tools.long_task import CompleteTool
|
store = HandoffState()
|
||||||
|
|
||||||
store: dict[str, str] = {}
|
|
||||||
tool = CompleteTool(store)
|
tool = CompleteTool(store)
|
||||||
result = await tool.execute(summary="All 100 items processed. Summary in report.md")
|
result = await tool.execute(summary="All 100 items processed. Summary in report.md")
|
||||||
assert result == "Task marked as complete."
|
assert result == "Task marked as complete. Awaiting validation."
|
||||||
assert store["type"] == "complete"
|
assert store.message == "All 100 items processed. Summary in report.md"
|
||||||
assert store["payload"] == "All 100 items processed. Summary in report.md"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_signal_tools_overwrite_on_multiple_calls():
|
async def test_signal_tools_overwrite_on_multiple_calls():
|
||||||
"""Last call wins -- the orchestrator only reads the final signal."""
|
"""Last call wins — the orchestrator only reads the final signal."""
|
||||||
from nanobot.agent.tools.long_task import HandoffTool, CompleteTool
|
store = HandoffState()
|
||||||
|
|
||||||
store: dict[str, str] = {}
|
|
||||||
handoff = HandoffTool(store)
|
handoff = HandoffTool(store)
|
||||||
complete = CompleteTool(store)
|
complete = CompleteTool(store)
|
||||||
await handoff.execute(message="first progress")
|
await handoff.execute(message="first progress")
|
||||||
assert store["type"] == "handoff"
|
assert store.message == "first progress"
|
||||||
await complete.execute(summary="done early")
|
await complete.execute(summary="done early")
|
||||||
assert store["type"] == "complete"
|
assert store.message == "done early"
|
||||||
assert store["payload"] == "done early"
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Helper: minimal SubagentManager stub
|
# Helper: minimal SubagentManager stub
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def _make_manager_stub():
|
def _make_manager_stub():
|
||||||
"""Create a minimal SubagentManager stub with a mockable run_step."""
|
"""Create a minimal SubagentManager stub with a mockable run_step."""
|
||||||
mgr = MagicMock()
|
mgr = MagicMock()
|
||||||
@ -64,6 +93,7 @@ def _step_result(**overrides):
|
|||||||
tool_events=[],
|
tool_events=[],
|
||||||
stop_reason="completed",
|
stop_reason="completed",
|
||||||
tools_used=[],
|
tools_used=[],
|
||||||
|
usage={},
|
||||||
)
|
)
|
||||||
defaults.update(overrides)
|
defaults.update(overrides)
|
||||||
return SimpleNamespace(**defaults)
|
return SimpleNamespace(**defaults)
|
||||||
@ -76,68 +106,150 @@ def _step_result(**overrides):
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_long_task_completes_in_one_step():
|
async def test_long_task_completes_in_one_step():
|
||||||
"""Subagent calls complete() immediately."""
|
"""Subagent calls complete() immediately, validation passes."""
|
||||||
from nanobot.agent.tools.long_task import LongTaskTool
|
|
||||||
|
|
||||||
mgr = _make_manager_stub()
|
mgr = _make_manager_stub()
|
||||||
|
call_count = 0
|
||||||
|
|
||||||
async def fake_run_step(*, system_prompt, user_message, extra_tools):
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
for t in extra_tools:
|
nonlocal call_count
|
||||||
if t.name == "complete":
|
call_count += 1
|
||||||
await t.execute(summary="All done. Report in summary.md")
|
if call_count == 1:
|
||||||
return _step_result(
|
for t in extra_tools:
|
||||||
final_content="All done.",
|
if t.name == "complete":
|
||||||
tools_used=["complete"],
|
await t.execute(summary="All done. Report in summary.md")
|
||||||
)
|
return _step_result(
|
||||||
|
final_content="All done.",
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Validation round
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Validated")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
mgr.run_step.side_effect = fake_run_step
|
mgr.run_step.side_effect = fake_run_step
|
||||||
tool = LongTaskTool(manager=mgr)
|
tool = LongTaskTool(manager=mgr)
|
||||||
result = await tool.execute(goal="Audit all issues.")
|
result = await tool.execute(goal="Audit all issues.")
|
||||||
assert result == "All done. Report in summary.md"
|
assert result == "All done. Report in summary.md"
|
||||||
|
assert call_count == 2 # main step + validation
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_long_task_completes_after_multiple_handoffs():
|
async def test_long_task_completes_after_multiple_handoffs():
|
||||||
"""Subagent calls handoff() twice then complete()."""
|
"""Subagent calls handoff() twice then complete(), validation passes."""
|
||||||
from nanobot.agent.tools.long_task import LongTaskTool
|
|
||||||
|
|
||||||
mgr = _make_manager_stub()
|
mgr = _make_manager_stub()
|
||||||
call_count = 0
|
call_count = 0
|
||||||
|
|
||||||
async def fake_run_step(*, system_prompt, user_message, extra_tools):
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
nonlocal call_count
|
nonlocal call_count
|
||||||
call_count += 1
|
call_count += 1
|
||||||
if call_count == 1:
|
if call_count == 1:
|
||||||
for t in extra_tools:
|
for t in extra_tools:
|
||||||
if t.name == "handoff":
|
if t.name == "handoff":
|
||||||
await t.execute(message="Processed 1-8.")
|
await t.execute(message="Processed 1-8.")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["handoff"],
|
||||||
|
tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
elif call_count == 2:
|
elif call_count == 2:
|
||||||
assert "Processed 1-8." in user_message
|
assert "Processed 1-8." in user_message
|
||||||
assert "8 tool calls" in user_message
|
assert "Step 2" in user_message or "Step 2 of" in user_message
|
||||||
for t in extra_tools:
|
for t in extra_tools:
|
||||||
if t.name == "handoff":
|
if t.name == "handoff":
|
||||||
await t.execute(message="Processed 9-16.")
|
await t.execute(message="Processed 9-16.")
|
||||||
else:
|
return _step_result(
|
||||||
|
tools_used=["handoff"],
|
||||||
|
tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
elif call_count == 3:
|
||||||
for t in extra_tools:
|
for t in extra_tools:
|
||||||
if t.name == "complete":
|
if t.name == "complete":
|
||||||
await t.execute(summary="All 16 items audited.")
|
await t.execute(summary="All 16 items audited.")
|
||||||
return _step_result(tools_used=["handoff"])
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Validation round
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Validated")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
mgr.run_step.side_effect = fake_run_step
|
mgr.run_step.side_effect = fake_run_step
|
||||||
tool = LongTaskTool(manager=mgr)
|
tool = LongTaskTool(manager=mgr)
|
||||||
result = await tool.execute(goal="Audit 16 issues.")
|
result = await tool.execute(goal="Audit 16 issues.")
|
||||||
assert result == "All 16 items audited."
|
assert result == "All 16 items audited."
|
||||||
assert call_count == 3
|
assert call_count == 4 # 3 main steps + validation
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_long_task_validation_falls_back_to_handoff():
|
||||||
|
"""Subagent claims complete but validation fails — task continues."""
|
||||||
|
mgr = _make_manager_stub()
|
||||||
|
call_count = 0
|
||||||
|
|
||||||
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
|
nonlocal call_count
|
||||||
|
call_count += 1
|
||||||
|
if call_count == 1:
|
||||||
|
# First step claims complete
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Done.")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
elif call_count == 2:
|
||||||
|
# Validation round fails (handoff called)
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "handoff":
|
||||||
|
await t.execute(message="Not actually done. Need more work.")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["handoff"],
|
||||||
|
tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
elif call_count == 3:
|
||||||
|
# Continue and complete for real
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Really done.")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Second validation passes
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Validated")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
|
mgr.run_step.side_effect = fake_run_step
|
||||||
|
tool = LongTaskTool(manager=mgr)
|
||||||
|
result = await tool.execute(goal="Do something.", max_steps=5)
|
||||||
|
assert "Really done." == result
|
||||||
|
assert call_count == 4
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_long_task_fallback_when_no_signal_called():
|
async def test_long_task_fallback_when_no_signal_called():
|
||||||
"""Subagent doesn't call handoff/complete — extract progress from messages."""
|
"""Subagent doesn't call handoff/complete — extract progress from messages."""
|
||||||
from nanobot.agent.tools.long_task import LongTaskTool
|
|
||||||
|
|
||||||
mgr = _make_manager_stub()
|
mgr = _make_manager_stub()
|
||||||
|
|
||||||
async def fake_run_step(*, system_prompt, user_message, extra_tools):
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
return _step_result(
|
return _step_result(
|
||||||
final_content="Tool budget exhausted.",
|
final_content="Tool budget exhausted.",
|
||||||
messages=[
|
messages=[
|
||||||
@ -161,57 +273,227 @@ async def test_long_task_fallback_when_no_signal_called():
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_long_task_auto_extracts_on_natural_end():
|
async def test_long_task_auto_extracts_on_natural_end():
|
||||||
"""Subagent finishes naturally (stop_reason=completed) without calling signal."""
|
"""Subagent finishes naturally (stop_reason=completed) without calling signal."""
|
||||||
from nanobot.agent.tools.long_task import LongTaskTool
|
|
||||||
|
|
||||||
mgr = _make_manager_stub()
|
mgr = _make_manager_stub()
|
||||||
steps = 0
|
call_count = 0
|
||||||
|
|
||||||
async def fake_run_step(*, system_prompt, user_message, extra_tools):
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
nonlocal steps
|
nonlocal call_count
|
||||||
steps += 1
|
call_count += 1
|
||||||
if steps == 1:
|
if call_count == 1:
|
||||||
return _step_result(
|
return _step_result(
|
||||||
final_content="I processed items 1-5. Results in out.md.",
|
final_content="I processed items 1-5. Results in out.md.",
|
||||||
stop_reason="completed",
|
stop_reason="completed",
|
||||||
)
|
)
|
||||||
# Second step: subagent calls complete
|
elif call_count == 2:
|
||||||
for t in extra_tools:
|
for t in extra_tools:
|
||||||
if t.name == "complete":
|
if t.name == "complete":
|
||||||
await t.execute(summary="All done.")
|
await t.execute(summary="All done.")
|
||||||
return _step_result(
|
return _step_result(
|
||||||
final_content="All done.",
|
final_content="All done.",
|
||||||
tools_used=["complete"],
|
tools_used=["complete"],
|
||||||
)
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Validation
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Validated")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
mgr.run_step.side_effect = fake_run_step
|
mgr.run_step.side_effect = fake_run_step
|
||||||
tool = LongTaskTool(manager=mgr)
|
tool = LongTaskTool(manager=mgr)
|
||||||
result = await tool.execute(goal="Process items.", max_steps=5)
|
result = await tool.execute(goal="Process items.", max_steps=5)
|
||||||
assert "All done." == result
|
assert "All done." == result
|
||||||
assert steps == 2
|
assert call_count == 3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_long_task_goal_appears_in_system_prompt():
|
async def test_long_task_retries_on_crash():
|
||||||
"""Verify every step's system_prompt contains the long task system prompt."""
|
"""A step that crashes once should be retried."""
|
||||||
from nanobot.agent.tools.long_task import LongTaskTool
|
|
||||||
|
|
||||||
mgr = _make_manager_stub()
|
mgr = _make_manager_stub()
|
||||||
captured_prompts = []
|
call_count = 0
|
||||||
|
|
||||||
async def fake_run_step(*, system_prompt, user_message, extra_tools):
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
captured_prompts.append(system_prompt)
|
nonlocal call_count
|
||||||
|
call_count += 1
|
||||||
|
if call_count == 1:
|
||||||
|
raise RuntimeError("Simulated crash")
|
||||||
for t in extra_tools:
|
for t in extra_tools:
|
||||||
if t.name == "complete":
|
if t.name == "complete":
|
||||||
await t.execute(summary="done")
|
await t.execute(summary="Recovered.")
|
||||||
return _step_result(final_content="done")
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
mgr.run_step.side_effect = fake_run_step
|
mgr.run_step.side_effect = fake_run_step
|
||||||
tool = LongTaskTool(manager=mgr)
|
tool = LongTaskTool(manager=mgr)
|
||||||
await tool.execute(goal="Audit everything.")
|
result = await tool.execute(goal="Test retry.")
|
||||||
assert len(captured_prompts) == 1
|
assert "Recovered." == result
|
||||||
assert "handoff()" in captured_prompts[0]
|
assert call_count == 3 # main step + retry + validation
|
||||||
assert "complete()" in captured_prompts[0]
|
|
||||||
assert "filesystem" in captured_prompts[0]
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_long_task_fails_after_two_crashes():
|
||||||
|
"""A step that crashes twice should terminate the task."""
|
||||||
|
mgr = _make_manager_stub()
|
||||||
|
|
||||||
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
|
raise RuntimeError("Persistent crash")
|
||||||
|
|
||||||
|
mgr.run_step.side_effect = fake_run_step
|
||||||
|
tool = LongTaskTool(manager=mgr)
|
||||||
|
result = await tool.execute(goal="Test failure.", max_steps=3)
|
||||||
|
assert "failed at step 1/3" in result
|
||||||
|
assert tool.status == "error"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_long_task_uses_dynamic_budget():
|
||||||
|
"""Final steps should use lower max_iterations."""
|
||||||
|
mgr = _make_manager_stub()
|
||||||
|
captured_budgets = []
|
||||||
|
|
||||||
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
|
captured_budgets.append(max_iterations)
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Done.")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
|
mgr.run_step.side_effect = fake_run_step
|
||||||
|
tool = LongTaskTool(manager=mgr)
|
||||||
|
await tool.execute(goal="Test budget.", max_steps=5)
|
||||||
|
# Step 0-2 should use 8, step 3+ should use 4
|
||||||
|
# But we complete on step 0, so only one budget captured
|
||||||
|
assert captured_budgets[0] == 8
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Hook and observability tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_hooks_receive_events():
|
||||||
|
"""Registered hooks should be called during execution."""
|
||||||
|
mgr = _make_manager_stub()
|
||||||
|
events = []
|
||||||
|
|
||||||
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Done.")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
|
mgr.run_step.side_effect = fake_run_step
|
||||||
|
tool = LongTaskTool(manager=mgr)
|
||||||
|
tool.set_hooks({
|
||||||
|
"on_task_start": lambda **kw: events.append(("task_start", kw)),
|
||||||
|
"on_step_start": lambda **kw: events.append(("step_start", kw)),
|
||||||
|
"on_step_complete": lambda **kw: events.append(("step_complete", kw)),
|
||||||
|
"on_validation_started": lambda **kw: events.append(("validation_started", kw)),
|
||||||
|
"on_task_complete": lambda **kw: events.append(("task_complete", kw)),
|
||||||
|
})
|
||||||
|
await tool.execute(goal="Test hooks.")
|
||||||
|
|
||||||
|
assert any(e[0] == "task_start" for e in events)
|
||||||
|
assert any(e[0] == "step_start" for e in events)
|
||||||
|
assert any(e[0] == "step_complete" for e in events)
|
||||||
|
assert any(e[0] == "validation_started" for e in events)
|
||||||
|
assert any(e[0] == "task_complete" for e in events)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_catch_all_hook_receives_events():
|
||||||
|
"""The on_event catch-all hook should receive all events."""
|
||||||
|
mgr = _make_manager_stub()
|
||||||
|
events = []
|
||||||
|
|
||||||
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Done.")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
|
mgr.run_step.side_effect = fake_run_step
|
||||||
|
tool = LongTaskTool(manager=mgr)
|
||||||
|
tool.set_hooks({
|
||||||
|
"on_event": lambda ev: events.append(ev.type),
|
||||||
|
})
|
||||||
|
await tool.execute(goal="Test catch-all.")
|
||||||
|
|
||||||
|
assert "task_start" in events
|
||||||
|
assert "step_start" in events
|
||||||
|
assert "task_complete" in events
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_state_exposure():
|
||||||
|
"""Properties should reflect current execution state."""
|
||||||
|
mgr = _make_manager_stub()
|
||||||
|
|
||||||
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "handoff":
|
||||||
|
await t.execute(message="Progress.")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["handoff"],
|
||||||
|
tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
|
mgr.run_step.side_effect = fake_run_step
|
||||||
|
tool = LongTaskTool(manager=mgr)
|
||||||
|
assert tool.status == "idle"
|
||||||
|
|
||||||
|
# Start execution in background so we can inspect mid-run
|
||||||
|
import asyncio
|
||||||
|
task = asyncio.create_task(tool.execute(goal="Test state.", max_steps=3))
|
||||||
|
# Give it a moment to start
|
||||||
|
await asyncio.sleep(0.01)
|
||||||
|
# Task should have finished by now since mocks are instant
|
||||||
|
await task
|
||||||
|
|
||||||
|
assert tool.goal == "Test state."
|
||||||
|
assert tool.total_steps == 3
|
||||||
|
assert tool.status == "error" # max_steps reached
|
||||||
|
assert tool.last_handoff.message == "Progress."
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_inject_correction():
|
||||||
|
"""User correction should appear in the next step's user message."""
|
||||||
|
mgr = _make_manager_stub()
|
||||||
|
captured_messages = []
|
||||||
|
|
||||||
|
async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
|
||||||
|
captured_messages.append(user_message)
|
||||||
|
for t in extra_tools:
|
||||||
|
if t.name == "complete":
|
||||||
|
await t.execute(summary="Done.")
|
||||||
|
return _step_result(
|
||||||
|
tools_used=["complete"],
|
||||||
|
tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
|
mgr.run_step.side_effect = fake_run_step
|
||||||
|
tool = LongTaskTool(manager=mgr)
|
||||||
|
tool.inject_correction("Focus on error handling.")
|
||||||
|
await tool.execute(goal="Refactor code.")
|
||||||
|
|
||||||
|
assert any("Focus on error handling" in msg for msg in captured_messages)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@ -219,30 +501,48 @@ async def test_long_task_goal_appears_in_system_prompt():
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def test_build_user_message_step_0():
|
def test_build_system_prompt():
|
||||||
from nanobot.agent.tools.long_task import _build_user_message
|
prompt = _build_system_prompt(budget=8)
|
||||||
|
assert "handoff()" in prompt
|
||||||
|
assert "complete()" in prompt
|
||||||
|
assert "8 tool calls" in prompt
|
||||||
|
|
||||||
msg = _build_user_message("Audit all issues.", step=0, handoff="")
|
|
||||||
assert msg.startswith("Audit all issues.")
|
def test_build_user_message_step_0():
|
||||||
assert "Step 1" in msg
|
msg = _build_user_message("Audit all issues.", step=0, max_steps=20, handoff=HandoffState())
|
||||||
|
assert "Audit all issues." in msg
|
||||||
|
assert "Step 1 of 20" in msg
|
||||||
assert "8 tool calls" in msg
|
assert "8 tool calls" in msg
|
||||||
assert "Previous Progress" not in msg
|
assert "Previous Progress" not in msg
|
||||||
|
|
||||||
|
|
||||||
def test_build_user_message_later_step():
|
def test_build_user_message_later_step():
|
||||||
from nanobot.agent.tools.long_task import _build_user_message
|
handoff = HandoffState(message="Did 1-10.", files_created=["a.py"], next_step_hint="Do Y")
|
||||||
|
msg = _build_user_message("Audit all issues.", step=3, max_steps=20, handoff=handoff)
|
||||||
msg = _build_user_message("Audit all issues.", step=3, handoff="Did 1-10.")
|
|
||||||
assert "Audit all issues." in msg
|
assert "Audit all issues." in msg
|
||||||
assert "Previous Progress" in msg
|
assert "Previous Progress" in msg
|
||||||
assert "Did 1-10." in msg
|
assert "Did 1-10." in msg
|
||||||
assert "Step 4" in msg
|
assert "a.py" in msg
|
||||||
assert "8 tool calls" in msg
|
assert "Do Y" in msg
|
||||||
|
assert "Step 4 of 20" in msg
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_user_message_final_step():
|
||||||
|
handoff = HandoffState(message="Almost done.")
|
||||||
|
msg = _build_user_message("Audit all issues.", step=18, max_steps=20, handoff=handoff)
|
||||||
|
assert "FINAL Step" in msg
|
||||||
|
assert "4 tool calls" in msg # final steps use lower budget
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_user_message_with_correction():
|
||||||
|
msg = _build_user_message(
|
||||||
|
"Audit.", step=0, max_steps=20, handoff=HandoffState(), correction="Skip file A"
|
||||||
|
)
|
||||||
|
assert "User Correction" in msg
|
||||||
|
assert "Skip file A" in msg
|
||||||
|
|
||||||
|
|
||||||
def test_extract_handoff_from_messages():
|
def test_extract_handoff_from_messages():
|
||||||
from nanobot.agent.tools.long_task import _extract_handoff_from_messages
|
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "system", "content": "sys"},
|
{"role": "system", "content": "sys"},
|
||||||
{"role": "user", "content": "do it"},
|
{"role": "user", "content": "do it"},
|
||||||
@ -254,8 +554,6 @@ def test_extract_handoff_from_messages():
|
|||||||
|
|
||||||
|
|
||||||
def test_extract_handoff_skips_budget_message():
|
def test_extract_handoff_skips_budget_message():
|
||||||
from nanobot.agent.tools.long_task import _extract_handoff_from_messages
|
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "system", "content": "sys"},
|
{"role": "system", "content": "sys"},
|
||||||
{"role": "user", "content": "do it"},
|
{"role": "user", "content": "do it"},
|
||||||
@ -268,12 +566,26 @@ def test_extract_handoff_skips_budget_message():
|
|||||||
|
|
||||||
|
|
||||||
def test_extract_handoff_from_empty_messages():
|
def test_extract_handoff_from_empty_messages():
|
||||||
from nanobot.agent.tools.long_task import _extract_handoff_from_messages
|
|
||||||
|
|
||||||
assert _extract_handoff_from_messages([]) == ""
|
assert _extract_handoff_from_messages([]) == ""
|
||||||
assert _extract_handoff_from_messages([{"role": "system", "content": "sys"}]) == ""
|
assert _extract_handoff_from_messages([{"role": "system", "content": "sys"}]) == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_file_changes_from_tool_events():
|
||||||
|
events = [
|
||||||
|
{"name": "write_file", "status": "ok", "detail": "Wrote /workspace/a.py: done"},
|
||||||
|
{"name": "edit_file", "status": "ok", "detail": "Edited /workspace/b.py: patched"},
|
||||||
|
{"name": "read_file", "status": "ok", "detail": "Read /workspace/c.py"},
|
||||||
|
{"name": "write_file", "status": "error", "detail": "Failed"},
|
||||||
|
]
|
||||||
|
created, modified = _extract_file_changes(events)
|
||||||
|
assert created == ["/workspace/a.py"]
|
||||||
|
assert modified == ["/workspace/b.py"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_file_changes_empty():
|
||||||
|
assert _extract_file_changes([]) == ([], [])
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Integration: verify LongTaskTool is wired into the main agent loop
|
# Integration: verify LongTaskTool is wired into the main agent loop
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user