nanobot/tests/agent/tools/test_long_task.py
Xubin Ren 7c29a738a5 test(long-task): expect wrapped completion message after validation
Align assertions with LongTaskTool final return shape on main.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-13 17:45:06 +00:00

889 lines
32 KiB
Python

"""Tests for Long Task Tool: HandoffTool, CompleteTool, LongTaskTool."""
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock
import pytest
from nanobot.agent.tools.long_task import (
CompleteTool,
HandoffState,
HandoffTool,
LongTaskTool,
_build_system_prompt,
_build_user_message,
_extract_file_changes,
_extract_handoff_from_messages,
)
# Text expected in front of the summary in LongTaskTool's final return value
# once validation has succeeded (mirrors what the assertions below compare against).
_COMPLETION_PREFIX = (
    "The task is complete. "
    "This is the final answer — present it to the user directly "
    "without calling additional tools or reading files."
    "\n\n"
)


def _wrapped_completion(summary: str) -> str:
    """Return ``summary`` preceded by the completion prefix the tests expect."""
    return _COMPLETION_PREFIX + summary
# ---------------------------------------------------------------------------
# Signal tool tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_handoff_tool_stores_structured_signal():
    """HandoffTool.execute copies every supplied field onto the shared state."""
    state = HandoffState()

    reply = await HandoffTool(state).execute(
        message="Processed items 1-8. Results in out.md.",
        files_created=["out.md", "report.md"],
        files_modified=["main.py"],
        next_step_hint="Continue with item 9.",
        verification="Tests passed",
    )

    assert reply == "Progress recorded. The next step will continue from here."
    recorded = (
        state.signal_type,
        state.message,
        state.files_created,
        state.files_modified,
        state.next_step_hint,
        state.verification,
    )
    assert recorded == (
        "handoff",
        "Processed items 1-8. Results in out.md.",
        ["out.md", "report.md"],
        ["main.py"],
        "Continue with item 9.",
        "Tests passed",
    )
@pytest.mark.asyncio
async def test_handoff_tool_defaults_optional_fields():
    """Optional handoff fields left out of the call end up as empty lists / strings."""
    state = HandoffState()

    await HandoffTool(state).execute(message="Done.")

    assert (state.files_created, state.files_modified) == ([], [])
    assert (state.next_step_hint, state.verification) == ("", "")
@pytest.mark.asyncio
async def test_complete_tool_stores_signal():
    """CompleteTool.execute records a "complete" signal carrying the summary."""
    summary = "All 100 items processed. Summary in report.md"
    state = HandoffState()

    reply = await CompleteTool(state).execute(summary=summary)

    assert reply == "Task marked as complete. Awaiting validation."
    assert state.signal_type == "complete"
    assert state.message == summary
@pytest.mark.asyncio
async def test_signal_tools_overwrite_on_multiple_calls():
    """Two signal tools share one state; the later call replaces the earlier one."""
    shared = HandoffState()
    handoff_tool, complete_tool = HandoffTool(shared), CompleteTool(shared)

    await handoff_tool.execute(message="first progress")
    assert shared.message == "first progress"

    # The complete() call lands on the same state object and overwrites it.
    await complete_tool.execute(summary="done early")
    assert (shared.signal_type, shared.message) == ("complete", "done early")
# ---------------------------------------------------------------------------
# Helper: minimal SubagentManager stub
# ---------------------------------------------------------------------------
def _make_manager_stub():
    """Return a MagicMock standing in for SubagentManager, with an AsyncMock ``run_step``."""
    # Keyword arguments to MagicMock are set as attributes on the new mock.
    return MagicMock(run_step=AsyncMock())
def _step_result(**overrides):
    """Build a SimpleNamespace shaped like an AgentRunResult.

    Keyword arguments replace (or add to) the default fields.
    """
    base_fields = {
        "final_content": "step done",
        "messages": [],
        "tool_events": [],
        "stop_reason": "completed",
        "tools_used": [],
        "usage": {},
    }
    return SimpleNamespace(**{**base_fields, **overrides})
# ---------------------------------------------------------------------------
# LongTaskTool orchestrator tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_long_task_completes_in_one_step():
    """complete() on the first step plus a passing validation round ends the task."""
    manager = _make_manager_stub()
    invocations = []

    async def _call_complete(tools, summary):
        # Fire every tool named "complete" with the given summary.
        for candidate in tools:
            if candidate.name == "complete":
                await candidate.execute(summary=summary)

    async def scripted_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        invocations.append(user_message)
        complete_events = [{"name": "complete", "status": "ok", "detail": ""}]
        if len(invocations) == 1:
            await _call_complete(extra_tools, "All done. Report in summary.md")
            return _step_result(
                final_content="All done.",
                tools_used=["complete"],
                tool_events=complete_events,
            )
        # Every later call is the validation round.
        await _call_complete(extra_tools, "Validated")
        return _step_result(tools_used=["complete"], tool_events=complete_events)

    manager.run_step.side_effect = scripted_step

    outcome = await LongTaskTool(manager=manager).execute(goal="Audit all issues.")

    assert outcome == _wrapped_completion("All done. Report in summary.md")
    assert len(invocations) == 2  # main step + validation
@pytest.mark.asyncio
async def test_long_task_completes_after_multiple_handoffs():
    """Two handoff() steps, then complete(), then a passing validation round."""
    manager = _make_manager_stub()
    seen_messages = []

    async def _signal(tools, name, **kwargs):
        # Fire every tool whose name matches.
        for candidate in tools:
            if candidate.name == name:
                await candidate.execute(**kwargs)

    def _result_for(name):
        return _step_result(
            tools_used=[name],
            tool_events=[{"name": name, "status": "ok", "detail": ""}],
        )

    async def scripted_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        seen_messages.append(user_message)
        turn = len(seen_messages)
        if turn == 1:
            await _signal(extra_tools, "handoff", message="Processed 1-8.")
            return _result_for("handoff")
        if turn == 2:
            # The second step must be told about the first step's progress.
            assert "Processed 1-8." in user_message
            assert "Step 2 of 20" in user_message
            await _signal(extra_tools, "handoff", message="Processed 9-16.")
            return _result_for("handoff")
        if turn == 3:
            await _signal(extra_tools, "complete", summary="All 16 items audited.")
        else:
            # Validation round
            await _signal(extra_tools, "complete", summary="Validated")
        return _result_for("complete")

    manager.run_step.side_effect = scripted_step

    outcome = await LongTaskTool(manager=manager).execute(goal="Audit 16 issues.")

    assert outcome == _wrapped_completion("All 16 items audited.")
    assert len(seen_messages) == 4  # 3 main steps + validation
@pytest.mark.asyncio
async def test_long_task_uses_last_signal_when_multiple_signals_called():
    """If a step calls handoff() then complete(), complete() should win.

    The fake step fires the signals in an explicit handoff -> complete order
    instead of following the order in which ``extra_tools`` lists them, so the
    scenario under test does not depend on how the tool list is arranged.
    """
    mgr = _make_manager_stub()
    call_count = 0

    async def fire(tools, name, **kwargs):
        """Call ``execute(**kwargs)`` on every tool in ``tools`` named ``name``."""
        for t in tools:
            if t.name == name:
                await t.execute(**kwargs)

    async def fake_run_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        nonlocal call_count
        call_count += 1
        if call_count == 1:
            # Order matters here: handoff first, complete last.
            await fire(extra_tools, "handoff", message="Partial progress.")
            await fire(extra_tools, "complete", summary="Actually complete.")
            return _step_result(
                tools_used=["handoff", "complete"],
                tool_events=[
                    {"name": "handoff", "status": "ok", "detail": ""},
                    {"name": "complete", "status": "ok", "detail": ""},
                ],
            )
        # Validation round
        await fire(extra_tools, "complete", summary="Validated")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )

    mgr.run_step.side_effect = fake_run_step
    tool = LongTaskTool(manager=mgr)
    result = await tool.execute(goal="Do something.", max_steps=1)
    assert result == _wrapped_completion("Actually complete.")
    assert call_count == 2  # main step + validation
@pytest.mark.asyncio
async def test_long_task_validation_falls_back_to_handoff():
    """A completion claim rejected by the validation round sends the task back to work."""
    manager = _make_manager_stub()
    turns = []
    # Signal to fire on each call, keyed by 1-based call number. Calls beyond
    # the script are the second validation round, which passes.
    script = {
        1: ("complete", {"summary": "Done."}),  # first step claims complete
        2: ("handoff", {"message": "Not actually done. Need more work."}),  # validation fails
        3: ("complete", {"summary": "Really done."}),  # continue and complete for real
    }

    async def scripted_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        turns.append(user_message)
        signal, kwargs = script.get(len(turns), ("complete", {"summary": "Validated"}))
        for candidate in extra_tools:
            if candidate.name == signal:
                await candidate.execute(**kwargs)
        return _step_result(
            tools_used=[signal],
            tool_events=[{"name": signal, "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = scripted_step

    outcome = await LongTaskTool(manager=manager).execute(goal="Do something.", max_steps=5)

    assert outcome == _wrapped_completion("Really done.")
    assert len(turns) == 4
@pytest.mark.asyncio
async def test_long_task_fallback_when_no_signal_called():
    """With no handoff()/complete() call, progress is recovered from the step's messages."""
    manager = _make_manager_stub()

    def _msg(role, content):
        return {"role": role, "content": content}

    async def silent_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        # A fresh transcript is built on every call; no signal tool is invoked.
        transcript = [
            _msg("system", "..."),
            _msg("user", "..."),
            _msg("assistant", "I processed items 1-5. Results in out.md."),
            _msg("tool", "ok"),
            _msg("assistant", "Tool budget exhausted. Call handoff() earlier next time."),
        ]
        return _step_result(
            final_content="Tool budget exhausted.",
            messages=transcript,
            stop_reason="max_iterations",
        )

    manager.run_step.side_effect = silent_step

    outcome = await LongTaskTool(manager=manager).execute(goal="Do something.", max_steps=2)

    # The run hits max_steps and reports the progress extracted from the messages.
    for fragment in ("max steps (2)", "I processed items 1-5"):
        assert fragment in outcome
@pytest.mark.asyncio
async def test_long_task_auto_extracts_on_natural_end():
    """Step one ends naturally without a signal; step two completes; step three validates."""
    manager = _make_manager_stub()
    turns = []

    async def _call_complete(tools, summary):
        for candidate in tools:
            if candidate.name == "complete":
                await candidate.execute(summary=summary)

    async def scripted_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        turns.append(user_message)
        turn = len(turns)
        if turn == 1:
            # stop_reason="completed" but neither signal tool was called.
            return _step_result(
                final_content="I processed items 1-5. Results in out.md.",
                stop_reason="completed",
            )
        complete_events = [{"name": "complete", "status": "ok", "detail": ""}]
        if turn == 2:
            await _call_complete(extra_tools, "All done.")
            return _step_result(
                final_content="All done.",
                tools_used=["complete"],
                tool_events=complete_events,
            )
        # Validation
        await _call_complete(extra_tools, "Validated")
        return _step_result(tools_used=["complete"], tool_events=complete_events)

    manager.run_step.side_effect = scripted_step

    outcome = await LongTaskTool(manager=manager).execute(goal="Process items.", max_steps=5)

    assert outcome == _wrapped_completion("All done.")
    assert len(turns) == 3
@pytest.mark.asyncio
async def test_long_task_retries_on_crash():
    """One crash of run_step is followed by a retry, and the task still completes."""
    manager = _make_manager_stub()
    attempts = []

    async def flaky_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        attempts.append(user_message)
        if len(attempts) == 1:
            raise RuntimeError("Simulated crash")
        for candidate in extra_tools:
            if candidate.name == "complete":
                await candidate.execute(summary="Recovered.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = flaky_step

    outcome = await LongTaskTool(manager=manager).execute(goal="Test retry.")

    assert outcome == _wrapped_completion("Recovered.")
    assert len(attempts) == 3  # main step + retry + validation
@pytest.mark.asyncio
async def test_long_task_fails_after_two_crashes():
    """When run_step raises on every call, the task reports failure at step 1."""
    manager = _make_manager_stub()

    async def always_crash(*, system_prompt, user_message, extra_tools, max_iterations=None):
        raise RuntimeError("Persistent crash")

    manager.run_step.side_effect = always_crash
    long_task = LongTaskTool(manager=manager)

    outcome = await long_task.execute(goal="Test failure.", max_steps=3)

    assert "failed at step 1/3" in outcome
    assert long_task.status == "error"
@pytest.mark.asyncio
async def test_long_task_uses_dynamic_budget():
    """The first step of a 5-step run receives max_iterations=8."""
    manager = _make_manager_stub()
    budgets = []

    async def recording_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        budgets.append(max_iterations)
        for candidate in extra_tools:
            if candidate.name == "complete":
                await candidate.execute(summary="Done.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = recording_step

    await LongTaskTool(manager=manager).execute(goal="Test budget.", max_steps=5)

    # Steps 0-2 should use 8 and steps 3+ should use 4, but the task completes
    # on step 0, so only the first captured budget is asserted here.
    first_budget = budgets[0]
    assert first_budget == 8
# ---------------------------------------------------------------------------
# Hook and observability tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_hooks_receive_events():
    """Each registered per-event hook is invoked at least once during a run."""
    manager = _make_manager_stub()
    events = []
    labels = (
        "task_start",
        "step_start",
        "step_complete",
        "validation_started",
        "task_complete",
    )

    def _recorder(label):
        # Bind the label per hook so each callback tags its own events.
        return lambda **kw: events.append((label, kw))

    async def completing_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        for candidate in extra_tools:
            if candidate.name == "complete":
                await candidate.execute(summary="Done.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = completing_step
    long_task = LongTaskTool(manager=manager)
    long_task.set_hooks({f"on_{label}": _recorder(label) for label in labels})

    await long_task.execute(goal="Test hooks.")

    fired = {label for label, _ in events}
    for label in labels:
        assert label in fired
@pytest.mark.asyncio
async def test_catch_all_hook_receives_events():
    """The on_event hook is handed event objects whose ``type`` names the event."""
    manager = _make_manager_stub()
    event_types = []

    def catch_all(ev):
        event_types.append(ev.type)

    async def completing_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        for candidate in extra_tools:
            if candidate.name == "complete":
                await candidate.execute(summary="Done.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = completing_step
    long_task = LongTaskTool(manager=manager)
    long_task.set_hooks({"on_event": catch_all})

    await long_task.execute(goal="Test catch-all.")

    for expected in ("task_start", "step_start", "task_complete"):
        assert expected in event_types
@pytest.mark.asyncio
async def test_state_exposure():
    """status, goal, total_steps and last_handoff reflect the run before and after execute()."""
    manager = _make_manager_stub()

    async def handoff_only_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        for candidate in extra_tools:
            if candidate.name == "handoff":
                await candidate.execute(message="Progress.")
        return _step_result(
            tools_used=["handoff"],
            tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = handoff_only_step
    long_task = LongTaskTool(manager=manager)
    assert long_task.status == "idle"

    await long_task.execute(goal="Test state.", max_steps=3)

    assert (long_task.goal, long_task.total_steps) == ("Test state.", 3)
    # Every step only hands off, so the run ends by reaching max_steps.
    assert long_task.status == "error"
    assert long_task.last_handoff.message == "Progress."
@pytest.mark.asyncio
async def test_inject_correction():
    """A correction injected before execute() appears in some step's user message."""
    manager = _make_manager_stub()
    seen_messages = []

    async def recording_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        seen_messages.append(user_message)
        for candidate in extra_tools:
            if candidate.name == "complete":
                await candidate.execute(summary="Done.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = recording_step
    long_task = LongTaskTool(manager=manager)
    long_task.inject_correction("Focus on error handling.")

    await long_task.execute(goal="Refactor code.")

    carrying = [text for text in seen_messages if "Focus on error handling" in text]
    assert carrying
# ---------------------------------------------------------------------------
# Helper function tests
# ---------------------------------------------------------------------------
def test_build_system_prompt():
    """The system prompt mentions both signal tools and the tool-call budget."""
    rendered = _build_system_prompt(budget=8)
    for fragment in ("handoff()", "complete()", "8 tool calls"):
        assert fragment in rendered
def test_build_user_message_step_0():
    """Step 0 shows goal, step counter and budget, with no previous-progress section."""
    rendered = _build_user_message(
        "Audit all issues.", step=0, max_steps=20, handoff=HandoffState()
    )
    for fragment in ("Audit all issues.", "Step 1 of 20", "8 tool calls"):
        assert fragment in rendered
    assert "Previous Progress" not in rendered
def test_build_user_message_later_step():
    """A later step's message includes the previous handoff's message, files and hint."""
    previous = HandoffState(message="Did 1-10.", files_created=["a.py"], next_step_hint="Do Y")
    rendered = _build_user_message("Audit all issues.", step=3, max_steps=20, handoff=previous)
    expected_fragments = (
        "Audit all issues.",
        "Previous Progress",
        "Did 1-10.",
        "a.py",
        "Do Y",
        "Step 4 of 20",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_build_user_message_final_step():
    """Step 18 of 20 is flagged as a FINAL step and advertises the 4-call budget."""
    previous = HandoffState(message="Almost done.")
    rendered = _build_user_message("Audit all issues.", step=18, max_steps=20, handoff=previous)
    assert "FINAL Step" in rendered
    # final steps use the lower budget
    assert "4 tool calls" in rendered
def test_build_user_message_with_correction():
    """A supplied correction appears in the message under a "User Correction" heading."""
    rendered = _build_user_message(
        "Audit.",
        step=0,
        max_steps=20,
        handoff=HandoffState(),
        correction="Skip file A",
    )
    for fragment in ("User Correction", "Skip file A"):
        assert fragment in rendered
def test_extract_handoff_from_messages():
    """The extracted handoff is the non-empty assistant text that ends the transcript."""
    transcript = [
        dict(role="system", content="sys"),
        dict(role="user", content="do it"),
        dict(role="assistant", content=""),
        dict(role="tool", content="result"),
        dict(role="assistant", content="I processed items 1-3."),
    ]
    extracted = _extract_handoff_from_messages(transcript)
    assert extracted == "I processed items 1-3."
def test_extract_handoff_skips_budget_message():
    """A trailing "Tool budget exhausted" assistant message is not used as the handoff."""
    transcript = [
        dict(role="system", content="sys"),
        dict(role="user", content="do it"),
        dict(role="assistant", content="I processed items 1-3."),
        dict(role="tool", content="result"),
        dict(role="assistant", content="Tool budget exhausted. Call handoff() earlier."),
    ]
    # The budget notice is skipped in favour of the earlier progress message.
    extracted = _extract_handoff_from_messages(transcript)
    assert extracted == "I processed items 1-3."
def test_extract_handoff_from_empty_messages():
    """No messages, or only a system message, yields an empty string."""
    no_messages = []
    system_only = [{"role": "system", "content": "sys"}]
    assert _extract_handoff_from_messages(no_messages) == ""
    assert _extract_handoff_from_messages(system_only) == ""
def test_extract_file_changes_from_tool_events():
    """Successful write_file/edit_file events are split into created and modified paths.

    The read_file event and the failed write_file event contribute nothing.
    """

    def _event(name, detail, status="ok"):
        return {"name": name, "status": status, "detail": detail}

    tool_events = [
        _event("write_file", "Successfully wrote 12 characters to /workspace/a.py"),
        _event("edit_file", "Successfully edited /workspace/b.py"),
        _event("edit_file", "Successfully created /workspace/c.py"),
        _event("read_file", "Read /workspace/c.py"),
        _event("write_file", "Failed", status="error"),
    ]

    created, modified = _extract_file_changes(tool_events)

    assert created == ["/workspace/a.py"]
    assert modified == ["/workspace/b.py", "/workspace/c.py"]
def test_extract_file_changes_empty():
    """An empty event list yields a pair of empty lists."""
    outcome = _extract_file_changes([])
    assert outcome == ([], [])
# ---------------------------------------------------------------------------
# Boundary and edge-case tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_validation_step_crash_continues_task():
    """A crash inside the validation round does not end the task; it carries on."""
    manager = _make_manager_stub()
    turns = []

    async def scripted_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        turns.append(user_message)
        turn = len(turns)
        if turn == 2:
            # Validation round crashes
            raise RuntimeError("Validator exploded")
        # Call 1 claims completion; calls after the crash complete for real.
        summary = "Claimed done." if turn == 1 else "Actually done."
        for candidate in extra_tools:
            if candidate.name == "complete":
                await candidate.execute(summary=summary)
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = scripted_step

    outcome = await LongTaskTool(manager=manager).execute(
        goal="Test validation crash.", max_steps=5
    )

    assert outcome == _wrapped_completion("Actually done.")
    assert len(turns) == 4  # main step + validation crash + continue + re-validation
@pytest.mark.asyncio
async def test_hook_exception_does_not_stop_execution():
    """Hooks that raise RuntimeError do not prevent the task from completing."""
    manager = _make_manager_stub()

    def exploding_step_hook(**kw):
        raise RuntimeError("bad hook")

    def exploding_catch_all(ev):
        raise RuntimeError("bad catch-all")

    async def completing_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        for candidate in extra_tools:
            if candidate.name == "complete":
                await candidate.execute(summary="Done.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = completing_step
    long_task = LongTaskTool(manager=manager)
    long_task.set_hooks(
        {
            "on_step_start": exploding_step_hook,
            "on_event": exploding_catch_all,
        }
    )

    outcome = await long_task.execute(goal="Test hook resilience.")

    assert outcome == _wrapped_completion("Done.")
@pytest.mark.asyncio
async def test_inject_correction_during_execution():
    """A correction injected while step one is in flight shows up in a later user message."""
    import asyncio

    manager = _make_manager_stub()
    first_step_signalled = asyncio.Event()
    seen_messages = []

    async def _signal(tools, name, **kwargs):
        for candidate in tools:
            if candidate.name == name:
                await candidate.execute(**kwargs)

    def _result_for(name):
        return _step_result(
            tools_used=[name],
            tool_events=[{"name": name, "status": "ok", "detail": ""}],
        )

    async def scripted_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        seen_messages.append(user_message)
        if len(seen_messages) != 1:
            await _signal(extra_tools, "complete", summary="Step 2 done.")
            return _result_for("complete")
        await _signal(extra_tools, "handoff", message="Step 1 done.")
        first_step_signalled.set()
        # Yield control so the test body can inject the correction before this step returns.
        await asyncio.sleep(0)
        return _result_for("handoff")

    manager.run_step.side_effect = scripted_step
    long_task = LongTaskTool(manager=manager)
    running = asyncio.create_task(
        long_task.execute(goal="Test mid-run correction.", max_steps=5)
    )

    await first_step_signalled.wait()
    long_task.inject_correction("Change direction.")
    outcome = await running

    assert outcome == _wrapped_completion("Step 2 done.")
    assert any("Change direction." in text for text in seen_messages)
@pytest.mark.asyncio
async def test_multiple_corrections_consumed_in_order():
    """Two queued corrections both reach the subagent, the first one in an earlier step."""
    manager = _make_manager_stub()
    seen_messages = []

    async def handoff_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        seen_messages.append(user_message)
        for candidate in extra_tools:
            if candidate.name == "handoff":
                await candidate.execute(message=f"Step {len(seen_messages)} done.")
        return _step_result(
            tools_used=["handoff"],
            tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = handoff_step
    long_task = LongTaskTool(manager=manager)
    for note in ("First correction.", "Second correction."):
        long_task.inject_correction(note)

    await long_task.execute(goal="Test FIFO.", max_steps=3)

    # Indices of the user messages that carry each correction.
    first_hits = [i for i, text in enumerate(seen_messages) if "First correction." in text]
    second_hits = [i for i, text in enumerate(seen_messages) if "Second correction." in text]
    assert first_hits
    assert second_hits
    # Ordering: the first correction shows up at an earlier index than the second.
    assert first_hits[0] < second_hits[0]
@pytest.mark.asyncio
async def test_explicit_file_changes_override_auto_detected():
    """files_created reported via handoff() is kept as-is despite a write_file tool event."""
    manager = _make_manager_stub()

    async def handoff_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        for candidate in extra_tools:
            if candidate.name == "handoff":
                await candidate.execute(message="Done.", files_created=["explicit.py"])
        handoff_event = {"name": "handoff", "status": "ok", "detail": ""}
        # Auto-detection would pick this up as "auto.py"
        write_event = {
            "name": "write_file",
            "status": "ok",
            "detail": "Successfully wrote 7 characters to auto.py",
        }
        return _step_result(
            tools_used=["handoff"],
            tool_events=[handoff_event, write_event],
        )

    manager.run_step.side_effect = handoff_step
    long_task = LongTaskTool(manager=manager)

    await long_task.execute(goal="Test file merge.", max_steps=2)

    # The explicit report wins over auto-detection.
    assert long_task.last_handoff.files_created == ["explicit.py"]
@pytest.mark.asyncio
async def test_max_steps_one_uses_final_budget():
    """With max_steps=1 the first two run_step calls both receive max_iterations=4."""
    manager = _make_manager_stub()
    budgets = []

    async def recording_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        budgets.append(max_iterations)
        for candidate in extra_tools:
            if candidate.name == "complete":
                await candidate.execute(summary="Done.")
        return _step_result(
            tools_used=["complete"],
            tool_events=[{"name": "complete", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = recording_step

    await LongTaskTool(manager=manager).execute(goal="Test max_steps=1.", max_steps=1)

    # Index 0 is the final-step budget; index 1 is validation, which also uses the short budget.
    assert budgets[:2] == [4, 4]
@pytest.mark.asyncio
async def test_budget_switches_at_correct_step():
    """With max_steps=5 the budget drops from 8 to 4 at step 3 (max_steps - 2)."""
    manager = _make_manager_stub()
    budgets = []

    async def handoff_step(*, system_prompt, user_message, extra_tools, max_iterations=None):
        budgets.append(max_iterations)
        for candidate in extra_tools:
            if candidate.name == "handoff":
                await candidate.execute(message=f"Step {len(budgets)}.")
        return _step_result(
            tools_used=["handoff"],
            tool_events=[{"name": "handoff", "status": "ok", "detail": ""}],
        )

    manager.run_step.side_effect = handoff_step

    await LongTaskTool(manager=manager).execute(goal="Test budget switch.", max_steps=5)

    # Steps 0,1,2 use 8; steps 3,4 use 4
    assert budgets[:5] == [8, 8, 8, 4, 4]
# ---------------------------------------------------------------------------
# Integration: verify LongTaskTool is wired into the main agent loop
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_long_task_registered_in_tool_registry(tmp_path):
    """An AgentLoop built with a mocked provider exposes a tool under the name "long_task"."""
    from nanobot.agent.loop import AgentLoop
    from nanobot.bus.queue import MessageBus

    fake_provider = MagicMock()
    fake_provider.get_default_model.return_value = "test-model"

    agent_loop = AgentLoop(
        bus=MessageBus(),
        provider=fake_provider,
        workspace=tmp_path,
        model="test-model",
    )

    registered = agent_loop.tools.get("long_task")
    assert registered is not None
    assert registered.name == "long_task"