nanobot/tests/agent/test_runner_reasoning.py
Xubin Ren 01fa362c03 Merge origin/main into feat/show-reasoning
Resolves conflicts after main landed the state-machine turn refactor
and the test_runner.py 9-file split:

- nanobot/agent/loop.py: take main's `_state_build`/`_persist_user_message_early`
  flow; restore the `reasoning: bool` parameter on `_build_bus_progress_callback`
  so the loop hook can mark progress as reasoning-channel without coupling to
  the answer stream.
- nanobot/cli/stream.py: keep main's configurable `bot_name`/`bot_icon` header
  while preserving the PR's `transient=True` Live + `self._console` routing
  + `_renderable()` final-render path that fixed TUI duplication.
- tests/agent/test_runner.py was deleted on main and split into 9 focused
  files; relocated all 6 reasoning tests into a new `test_runner_reasoning.py`
  matching the new layout, deduplicated the per-test `ReasoningHook` boilerplate
  through a shared `_RecordingHook` helper.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-13 05:07:14 +00:00

280 lines
9.5 KiB
Python

"""Tests for AgentRunner reasoning extraction and emission.
Covers the three sources of model reasoning (dedicated ``reasoning_content``,
Anthropic ``thinking_blocks``, inline ``<think>``/``<thought>`` tags) plus
the streaming interaction: reasoning and answer streams are independent
channels, gated by ``context.streamed_reasoning`` rather than
``context.streamed_content``.
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from nanobot.agent.hook import AgentHook
from nanobot.config.schema import AgentDefaults
from nanobot.providers.base import LLMResponse, ToolCallRequest
# Default ``max_tool_result_chars`` read from a freshly constructed
# ``AgentDefaults``; every ``AgentRunSpec`` built in this module passes it
# through so the tests run with the project's default value.
_MAX_TOOL_RESULT_CHARS = AgentDefaults().max_tool_result_chars
class _RecordingHook(AgentHook):
    """Agent hook that remembers every non-empty reasoning string it is given.

    Tests inspect ``emitted`` after a run to check which reasoning payloads
    were handed to ``emit_reasoning`` and in what order.
    """

    def __init__(self) -> None:
        """Initialise the base hook and start with an empty record."""
        super().__init__()
        # Reasoning strings in the order they were received.
        self.emitted: list[str] = []

    async def emit_reasoning(self, reasoning_content: str | None) -> None:
        """Append ``reasoning_content`` to ``emitted`` unless it is falsy."""
        # ``None`` and the empty string are both ignored.
        if not reasoning_content:
            return
        self.emitted.append(reasoning_content)
@pytest.mark.asyncio
async def test_runner_preserves_reasoning_fields_in_assistant_history():
    """The tool-calling assistant turn keeps its reasoning payload in history.

    The first provider reply carries ``reasoning_content`` and
    ``thinking_blocks`` next to a tool call. The message list handed to the
    follow-up provider call must contain exactly one assistant entry with
    tool calls, and that entry must still hold both reasoning fields.
    """
    from nanobot.agent.runner import AgentRunSpec, AgentRunner

    calls_seen = 0
    followup_messages: list[dict] = []

    async def chat_with_retry(*, messages, **kwargs):
        nonlocal calls_seen, followup_messages
        calls_seen += 1
        if calls_seen > 1:
            # Snapshot what the runner sends once the tool round-trip is done.
            followup_messages = list(messages)
            return LLMResponse(content="done", tool_calls=[], usage={})
        # First call: request a tool and attach reasoning alongside it.
        return LLMResponse(
            content="thinking",
            tool_calls=[
                ToolCallRequest(id="call_1", name="list_dir", arguments={"path": "."}),
            ],
            reasoning_content="hidden reasoning",
            thinking_blocks=[{"type": "thinking", "thinking": "step"}],
            usage={"prompt_tokens": 5, "completion_tokens": 3},
        )

    provider = MagicMock(chat_with_retry=chat_with_retry)

    tools = MagicMock(execute=AsyncMock(return_value="tool result"))
    tools.get_definitions.return_value = []

    spec = AgentRunSpec(
        initial_messages=[
            {"role": "system", "content": "system"},
            {"role": "user", "content": "do task"},
        ],
        tools=tools,
        model="test-model",
        max_iterations=3,
        max_tool_result_chars=_MAX_TOOL_RESULT_CHARS,
    )
    result = await AgentRunner(provider).run(spec)

    assert result.final_content == "done"

    # Keep only assistant entries that actually requested a tool.
    tool_requesting_turns = []
    for entry in followup_messages:
        if entry.get("role") != "assistant":
            continue
        if not entry.get("tool_calls"):
            continue
        tool_requesting_turns.append(entry)

    assert len(tool_requesting_turns) == 1
    persisted = tool_requesting_turns[0]
    assert persisted["reasoning_content"] == "hidden reasoning"
    assert persisted["thinking_blocks"] == [{"type": "thinking", "thinking": "step"}]
@pytest.mark.asyncio
async def test_runner_emits_anthropic_thinking_blocks():
    """Two ``thinking_blocks`` on one response produce a single emission.

    The hook must receive exactly one reasoning string, and that string must
    contain the text of both thinking blocks. The answer content is returned
    unchanged.
    """
    from nanobot.agent.runner import AgentRunSpec, AgentRunner

    block_specs = (
        ("Let me analyze this step by step.", "sig1"),
        ("After careful consideration.", "sig2"),
    )

    async def chat_with_retry(**kwargs):
        return LLMResponse(
            content="The answer is 42.",
            thinking_blocks=[
                {"type": "thinking", "thinking": text, "signature": signature}
                for text, signature in block_specs
            ],
            tool_calls=[],
            usage={"prompt_tokens": 5, "completion_tokens": 3},
        )

    provider = MagicMock(chat_with_retry=chat_with_retry)

    tools = MagicMock()
    tools.get_definitions.return_value = []

    hook = _RecordingHook()
    spec = AgentRunSpec(
        initial_messages=[{"role": "user", "content": "question"}],
        tools=tools,
        model="test-model",
        max_iterations=3,
        max_tool_result_chars=_MAX_TOOL_RESULT_CHARS,
        hook=hook,
    )
    result = await AgentRunner(provider).run(spec)

    assert result.final_content == "The answer is 42."
    assert len(hook.emitted) == 1
    combined = hook.emitted[0]
    for fragment in ("Let me analyze this", "After careful consideration"):
        assert fragment in combined
@pytest.mark.asyncio
async def test_runner_emits_inline_think_content_as_reasoning():
    """Inline ``<think>...</think>`` text is emitted as reasoning and removed.

    With no dedicated reasoning field on the response, the hook must receive
    one emission containing the inline thinking text, and the final answer
    must be the content with the ``<think>`` block stripped out.
    """
    from nanobot.agent.runner import AgentRunSpec, AgentRunner

    async def chat_with_retry(**kwargs):
        return LLMResponse(
            content="<think>Let me think about this...\nThe answer is 42.</think>The answer is 42.",
            tool_calls=[],
            usage={"prompt_tokens": 5, "completion_tokens": 3},
        )

    provider = MagicMock(chat_with_retry=chat_with_retry)

    tools = MagicMock()
    tools.get_definitions.return_value = []

    hook = _RecordingHook()
    spec = AgentRunSpec(
        initial_messages=[{"role": "user", "content": "what is the answer?"}],
        tools=tools,
        model="test-model",
        max_iterations=3,
        max_tool_result_chars=_MAX_TOOL_RESULT_CHARS,
        hook=hook,
    )
    result = await AgentRunner(provider).run(spec)

    assert result.final_content == "The answer is 42."
    assert len(hook.emitted) == 1
    only_emission = hook.emitted[0]
    assert "Let me think about this" in only_emission
@pytest.mark.asyncio
async def test_runner_prefers_reasoning_content_over_inline_think():
    """A dedicated ``reasoning_content`` field beats inline ``<think>`` text.

    When a response has both, only the dedicated field is emitted to the
    hook. The inline ``<think>`` block is still stripped from the answer.
    """
    from nanobot.agent.runner import AgentRunSpec, AgentRunner

    async def chat_with_retry(**kwargs):
        return LLMResponse(
            content="<think>inline thinking</think>The answer.",
            reasoning_content="dedicated reasoning field",
            tool_calls=[],
            usage={"prompt_tokens": 5, "completion_tokens": 3},
        )

    provider = MagicMock(chat_with_retry=chat_with_retry)

    tools = MagicMock()
    tools.get_definitions.return_value = []

    hook = _RecordingHook()
    spec = AgentRunSpec(
        initial_messages=[{"role": "user", "content": "question"}],
        tools=tools,
        model="test-model",
        max_iterations=3,
        max_tool_result_chars=_MAX_TOOL_RESULT_CHARS,
        hook=hook,
    )
    result = await AgentRunner(provider).run(spec)

    assert result.final_content == "The answer."
    # Exactly one emission, and it is the dedicated field, not the inline text.
    assert hook.emitted == ["dedicated reasoning field"]
@pytest.mark.asyncio
async def test_runner_emits_reasoning_content_even_when_answer_was_streamed():
    """Streaming the answer does not suppress ``reasoning_content``.

    The fake provider streams the answer through ``on_content_delta`` and
    only attaches ``reasoning_content`` to the final response. The answer
    must reach the progress callback and the reasoning must still be emitted
    once to the hook.
    """
    from nanobot.agent.runner import AgentRunSpec, AgentRunner

    async def chat_stream_with_retry(*, on_content_delta=None, **kwargs):
        # Stream the answer in two pieces when a delta callback is supplied.
        for piece in ("The ", "answer.") if on_content_delta else ():
            await on_content_delta(piece)
        return LLMResponse(
            content="The answer.",
            reasoning_content="step-by-step deduction",
            tool_calls=[],
            usage={"prompt_tokens": 5, "completion_tokens": 3},
        )

    provider = MagicMock(
        supports_progress_deltas=True,
        chat_stream_with_retry=chat_stream_with_retry,
    )

    tools = MagicMock()
    tools.get_definitions.return_value = []

    streamed_chunks: list[str] = []

    async def _progress(content: str, **_kwargs):
        streamed_chunks.append(content)

    hook = _RecordingHook()
    spec = AgentRunSpec(
        initial_messages=[{"role": "user", "content": "question"}],
        tools=tools,
        model="test-model",
        max_iterations=3,
        max_tool_result_chars=_MAX_TOOL_RESULT_CHARS,
        hook=hook,
        stream_progress_deltas=True,
        progress_callback=_progress,
    )
    result = await AgentRunner(provider).run(spec)

    assert result.final_content == "The answer."
    assert streamed_chunks, "answer should have streamed via progress callback"
    assert hook.emitted == ["step-by-step deduction"]
@pytest.mark.asyncio
async def test_runner_does_not_double_emit_when_inline_think_already_streamed():
    """Inline ``<think>`` text seen in the stream is emitted exactly once.

    The fake provider sends the ``<think>`` block as a content delta and also
    repeats it in the final response content. The hook must end up with a
    single ``"working..."`` entry, and the final answer must not contain the
    ``<think>`` block.
    """
    from nanobot.agent.runner import AgentRunSpec, AgentRunner

    async def chat_stream_with_retry(*, on_content_delta=None, **kwargs):
        # Deliver the thinking block first, then the visible answer.
        deltas = ("<think>working...</think>", "The answer.")
        for piece in deltas if on_content_delta else ():
            await on_content_delta(piece)
        return LLMResponse(
            content="<think>working...</think>The answer.",
            tool_calls=[],
            usage={"prompt_tokens": 5, "completion_tokens": 3},
        )

    provider = MagicMock(
        supports_progress_deltas=True,
        chat_stream_with_retry=chat_stream_with_retry,
    )

    tools = MagicMock()
    tools.get_definitions.return_value = []

    async def _progress(content: str, **_kwargs):
        # Progress output is irrelevant here; the callback only has to exist.
        return None

    hook = _RecordingHook()
    spec = AgentRunSpec(
        initial_messages=[{"role": "user", "content": "question"}],
        tools=tools,
        model="test-model",
        max_iterations=3,
        max_tool_result_chars=_MAX_TOOL_RESULT_CHARS,
        hook=hook,
        stream_progress_deltas=True,
        progress_callback=_progress,
    )
    result = await AgentRunner(provider).run(spec)

    assert result.final_content == "The answer."
    assert hook.emitted == ["working..."]