nanobot/tests/agent/tools/test_long_task.py
chengyongru bf5762a3d4 feat(long-task): add LongTaskTool for multi-step agent tasks
Implements a meta-ReAct loop where long-running tasks are broken into
sequential subagent steps, each starting fresh with the original goal
and progress from the previous step. This prevents context drift when
agents work on complex, multi-step tasks.

- Extract build_tool_registry() from SubagentManager for reuse
- Add run_step() for synchronous subagent execution (no bus announcement)
- Add HandoffTool and CompleteTool as signal mechanisms via shared dict
- Add LongTaskTool orchestrator with simplified prompt (8 iterations/step)
- Register LongTaskTool in main agent loop
- Add _extract_handoff_from_messages fallback for robustness
2026-05-12 23:37:00 +08:00

263 lines
9.3 KiB
Python

"""Tests for Long Task Tool: HandoffTool, CompleteTool, LongTaskTool."""
import pytest
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock
@pytest.mark.asyncio
async def test_handoff_tool_stores_signal():
    """HandoffTool.execute acknowledges the call and records a handoff signal.

    The tool is given a plain dict; after execution that dict must hold the
    signal type and the progress message that was passed in.
    """
    from nanobot.agent.tools.long_task import HandoffTool

    progress_note = "Processed items 1-8. Results in out.md. Continue with item 9."
    signal: dict[str, str] = {}

    reply = await HandoffTool(signal).execute(message=progress_note)

    assert reply == "Progress recorded. The next step will continue from here."
    assert signal["type"] == "handoff"
    assert signal["payload"] == progress_note
@pytest.mark.asyncio
async def test_complete_tool_stores_signal():
    """CompleteTool.execute acknowledges the call and records a complete signal.

    The tool is given a plain dict; after execution that dict must hold the
    signal type and the summary that was passed in.
    """
    from nanobot.agent.tools.long_task import CompleteTool

    final_summary = "All 100 items processed. Summary in report.md"
    signal: dict[str, str] = {}

    reply = await CompleteTool(signal).execute(summary=final_summary)

    assert reply == "Task marked as complete."
    assert signal["type"] == "complete"
    assert signal["payload"] == final_summary
@pytest.mark.asyncio
async def test_signal_tools_overwrite_on_multiple_calls():
    """A later signal replaces an earlier one in the shared store.

    Both tools are built around the same dict, so after a handoff followed by
    a complete only the complete signal remains visible.
    """
    from nanobot.agent.tools.long_task import CompleteTool, HandoffTool

    shared: dict[str, str] = {}
    handoff_tool, complete_tool = HandoffTool(shared), CompleteTool(shared)

    await handoff_tool.execute(message="first progress")
    assert shared["type"] == "handoff"

    # The second call must overwrite both the type and the payload.
    await complete_tool.execute(summary="done early")
    assert shared["type"] == "complete"
    assert shared["payload"] == "done early"
# ---------------------------------------------------------------------------
# Helper: minimal SubagentManager stub
# ---------------------------------------------------------------------------
def _make_manager_stub():
"""Create a minimal SubagentManager stub with a mockable run_step."""
mgr = MagicMock()
mgr.run_step = AsyncMock()
return mgr
def _step_result(**overrides):
"""Create a minimal AgentRunResult-like namespace."""
defaults = dict(
final_content="step done",
messages=[],
tool_events=[],
stop_reason="completed",
tools_used=[],
)
defaults.update(overrides)
return SimpleNamespace(**defaults)
# ---------------------------------------------------------------------------
# LongTaskTool orchestrator tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_long_task_completes_in_one_step():
    """The task finishes after a single step when the subagent calls complete().

    The value returned by ``LongTaskTool.execute`` is expected to be the
    summary passed to ``complete()``, not the step's ``final_content``.
    """
    from nanobot.agent.tools.long_task import LongTaskTool

    mgr = _make_manager_stub()

    async def fake_run_step(*, system_prompt, user_message, extra_tools):
        # Simulate the subagent signalling completion through the injected tool.
        for t in extra_tools:
            if t.name == "complete":
                await t.execute(summary="All done. Report in summary.md")
        return _step_result(
            final_content="All done.",
            tools_used=["complete"],
        )

    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)

    result = await tool.execute(goal="Audit all issues.")

    assert result == "All done. Report in summary.md"
    # "One step" is the point of this test: without this check, an
    # orchestrator that kept starting steps after the completion signal
    # would still pass as long as it returned the same summary.
    assert mgr.run_step.await_count == 1
@pytest.mark.asyncio
async def test_long_task_completes_after_multiple_handoffs():
    """Two handoff() steps followed by a complete() step finish the task.

    Also checks that the second step's user message carries the first step's
    handoff text and the "8 tool calls" budget wording.
    """
    from nanobot.agent.tools.long_task import LongTaskTool

    mgr = _make_manager_stub()
    # (tool name, keyword arguments) the fake subagent issues on each step.
    # Any step past the end of the script repeats the final complete() call.
    script = [
        ("handoff", {"message": "Processed 1-8."}),
        ("handoff", {"message": "Processed 9-16."}),
        ("complete", {"summary": "All 16 items audited."}),
    ]
    seen_user_messages = []

    async def fake_run_step(*, system_prompt, user_message, extra_tools):
        step_index = len(seen_user_messages)
        # Only record here; the assertions live in the test body so that a
        # failure cannot be intercepted by error handling in the code under
        # test, which is what actually invokes this fake.
        seen_user_messages.append(user_message)
        tool_name, kwargs = script[min(step_index, len(script) - 1)]
        for t in extra_tools:
            if t.name == tool_name:
                await t.execute(**kwargs)
        # Report the tool that was really called so the fake result stays
        # consistent with the signal it emitted.
        return _step_result(tools_used=[tool_name])

    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)

    result = await tool.execute(goal="Audit 16 issues.")

    assert result == "All 16 items audited."
    assert len(seen_user_messages) == 3
    # The second step must be told what the first step handed off.
    assert "Processed 1-8." in seen_user_messages[1]
    assert "8 tool calls" in seen_user_messages[1]
@pytest.mark.asyncio
async def test_long_task_fallback_when_no_signal_called():
    """Progress is recovered from the transcript when no signal tool is used.

    Every step ends with ``stop_reason="max_iterations"`` and never calls
    handoff() or complete(), so the run is expected to hit ``max_steps`` and
    report the earlier assistant message as progress.
    """
    from nanobot.agent.tools.long_task import LongTaskTool

    mgr = _make_manager_stub()

    async def fake_run_step(*, system_prompt, user_message, extra_tools):
        # A fresh transcript per call, ending in the budget-exhausted notice.
        transcript = [
            {"role": "system", "content": "..."},
            {"role": "user", "content": "..."},
            {"role": "assistant", "content": "I processed items 1-5. Results in out.md."},
            {"role": "tool", "content": "ok"},
            {"role": "assistant", "content": "Tool budget exhausted. Call handoff() earlier next time."},
        ]
        return _step_result(
            final_content="Tool budget exhausted.",
            messages=transcript,
            stop_reason="max_iterations",
        )

    mgr.run_step.side_effect = fake_run_step

    outcome = await LongTaskTool(manager=mgr).execute(goal="Do something.", max_steps=2)

    # The result should mention the step limit and the extracted progress.
    for expected in ("max steps (2)", "I processed items 1-5"):
        assert expected in outcome
@pytest.mark.asyncio
async def test_long_task_goal_appears_in_system_prompt():
    """The system prompt passed to the step mentions the long-task instructions.

    A single step is run (it completes immediately) and its ``system_prompt``
    is checked for the "handoff()", "complete()" and "filesystem" fragments.
    """
    from nanobot.agent.tools.long_task import LongTaskTool

    mgr = _make_manager_stub()
    captured_prompts = []

    async def fake_run_step(*, system_prompt, user_message, extra_tools):
        captured_prompts.append(system_prompt)
        # Finish right away so exactly one prompt is captured.
        for candidate in extra_tools:
            if candidate.name == "complete":
                await candidate.execute(summary="done")
        return _step_result(final_content="done")

    mgr.run_step.side_effect = fake_run_step

    await LongTaskTool(manager=mgr).execute(goal="Audit everything.")

    assert len(captured_prompts) == 1
    only_prompt = captured_prompts[0]
    for fragment in ("handoff()", "complete()", "filesystem"):
        assert fragment in only_prompt
# ---------------------------------------------------------------------------
# Helper function tests
# ---------------------------------------------------------------------------
def test_build_user_message_step_0():
    """The first-step message leads with the goal and has no progress section."""
    from nanobot.agent.tools.long_task import _build_user_message

    goal = "Audit all issues."
    first_message = _build_user_message(goal, step=0, handoff="")

    assert first_message.startswith(goal)
    # Step numbers are shown 1-based, alongside the tool-call budget.
    for expected in ("Step 1", "8 tool calls"):
        assert expected in first_message
    assert "Previous Progress" not in first_message
def test_build_user_message_later_step():
    """A later-step message includes the goal, prior progress and step number."""
    from nanobot.agent.tools.long_task import _build_user_message

    later_message = _build_user_message(
        "Audit all issues.", step=3, handoff="Did 1-10."
    )

    required_fragments = (
        "Audit all issues.",
        "Previous Progress",
        "Did 1-10.",
        "Step 4",  # step=3 is displayed 1-based
        "8 tool calls",
    )
    for fragment in required_fragments:
        assert fragment in later_message
def test_extract_handoff_from_messages():
    """The last assistant message with content is returned as the handoff."""
    from nanobot.agent.tools.long_task import _extract_handoff_from_messages

    transcript = [
        {"role": "system", "content": "sys"},
        {"role": "user", "content": "do it"},
        # An assistant turn with empty content precedes the useful one.
        {"role": "assistant", "content": ""},
        {"role": "tool", "content": "result"},
        {"role": "assistant", "content": "I processed items 1-3."},
    ]

    extracted = _extract_handoff_from_messages(transcript)

    assert extracted == "I processed items 1-3."
def test_extract_handoff_skips_budget_message():
    """The trailing budget-exhausted notice is not used as the handoff."""
    from nanobot.agent.tools.long_task import _extract_handoff_from_messages

    transcript = [
        {"role": "system", "content": "sys"},
        {"role": "user", "content": "do it"},
        {"role": "assistant", "content": "I processed items 1-3."},
        {"role": "tool", "content": "result"},
        {"role": "assistant", "content": "Tool budget exhausted. Call handoff() earlier."},
    ]

    extracted = _extract_handoff_from_messages(transcript)

    # The earlier assistant message holds the real progress.
    assert extracted == "I processed items 1-3."
def test_extract_handoff_from_empty_messages():
    """An empty or system-only transcript yields an empty handoff string."""
    from nanobot.agent.tools.long_task import _extract_handoff_from_messages

    no_messages = []
    system_only = [{"role": "system", "content": "sys"}]

    assert _extract_handoff_from_messages(no_messages) == ""
    assert _extract_handoff_from_messages(system_only) == ""
# ---------------------------------------------------------------------------
# Integration: verify LongTaskTool is wired into the main agent loop
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_long_task_registered_in_tool_registry(tmp_path):
    """An AgentLoop exposes a tool named "long_task" through ``loop.tools``.

    The loop is built with a real MessageBus, a mocked provider and pytest's
    ``tmp_path`` as the workspace.
    """
    from nanobot.agent.loop import AgentLoop
    from nanobot.bus.queue import MessageBus

    message_bus = MessageBus()
    fake_provider = MagicMock()
    fake_provider.get_default_model.return_value = "test-model"

    agent_loop = AgentLoop(
        bus=message_bus,
        provider=fake_provider,
        workspace=tmp_path,
        model="test-model",
    )

    registered = agent_loop.tools.get("long_task")
    assert registered is not None
    assert registered.name == "long_task"