nanobot/tests/agent/test_consolidator.py
chengyongru 93bcb0a649 fix(agent): prevent history.jsonl bloat from raw_archive and stuck consolidation
Root cause: when consolidation LLM fails, raw_archive() dumped full message
content (~1MB) into history.jsonl with no size limit. Since build_system_prompt()
injects history.jsonl into every system prompt, all subsequent LLM calls exceeded
the 200K context window with error 1261.

Additionally, _cap_consolidation_boundary's 60-message cap caused consolidation
to get stuck on sessions with long tool chains (200+ iterations), triggering
the raw_archive fallback in the first place.

Three-layer fix:
- Remove _cap_consolidation_boundary: let pick_consolidation_boundary drive
  chunk sizing based solely on token budget
- Truncate archive() input: use tiktoken to cap formatted text to the model's
  input token budget before sending to consolidation LLM
- Truncate raw_archive() output: cap history.jsonl entries at 16K chars
2026-04-24 01:41:01 +08:00

233 lines
9.7 KiB
Python

"""Tests for the lightweight Consolidator — append-only to HISTORY.md."""
import pytest
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
from nanobot.agent.memory import Consolidator, MemoryStore, _RAW_ARCHIVE_MAX_CHARS
@pytest.fixture
def store(tmp_path):
    """A fresh MemoryStore rooted in a per-test temporary directory."""
    return MemoryStore(tmp_path)
@pytest.fixture
def mock_provider():
    """LLM provider stub whose chat_with_retry is awaitable and inspectable."""
    provider = MagicMock()
    provider.chat_with_retry = AsyncMock()
    return provider
@pytest.fixture
def consolidator(store, mock_provider):
    """Consolidator wired to the temp store, the stubbed provider, and a
    no-op session registry, with a small token budget for boundary tests."""
    sessions = MagicMock()
    sessions.save = MagicMock()
    return Consolidator(
        store=store,
        provider=mock_provider,
        model="test-model",
        sessions=sessions,
        context_window_tokens=1000,
        build_messages=MagicMock(return_value=[]),
        get_tool_definitions=MagicMock(return_value=[]),
        max_completion_tokens=100,
    )
class TestConsolidatorSummarize:
    """Summarization happy path, LLM-failure fallback, and empty input."""

    async def test_summarize_appends_to_history(self, consolidator, mock_provider, store):
        """Consolidator should call LLM to summarize, then append to HISTORY.md."""
        mock_provider.chat_with_retry.return_value = MagicMock(
            content="User fixed a bug in the auth module."
        )
        conversation = [
            {"role": "user", "content": "fix the auth bug"},
            {"role": "assistant", "content": "Done, fixed the race condition."},
        ]
        summary = await consolidator.archive(conversation)
        assert summary == "User fixed a bug in the auth module."
        history = store.read_unprocessed_history(since_cursor=0)
        assert len(history) == 1

    async def test_summarize_raw_dumps_on_llm_failure(self, consolidator, mock_provider, store):
        """On LLM failure, raw-dump messages to HISTORY.md."""
        mock_provider.chat_with_retry.side_effect = Exception("API error")
        summary = await consolidator.archive([{"role": "user", "content": "hello"}])
        # The fallback path yields no summary, only a [RAW] history entry.
        assert summary is None
        history = store.read_unprocessed_history(since_cursor=0)
        assert len(history) == 1
        assert "[RAW]" in history[0]["content"]

    async def test_summarize_skips_empty_messages(self, consolidator):
        """An empty message list is a no-op that produces no summary."""
        assert await consolidator.archive([]) is None
class TestConsolidatorArchiveErrorHandling:
    """archive() must fall back to raw_archive when the LLM returns an error
    response (finish_reason == 'error'), e.g. overloaded / quota exceeded.
    See https://github.com/HKUDS/nanobot/issues/3244
    """

    async def test_archive_falls_back_on_error_finish_reason(self, consolidator, mock_provider, store):
        """LLM returning finish_reason='error' should trigger raw_archive, not write error text."""
        mock_provider.chat_with_retry.return_value = MagicMock(
            content="Error: {'type': 'error', 'error': {'type': 'overloaded_error', 'message': 'overloaded_error (529)'}}",
            finish_reason="error",
        )
        conversation = [
            {"role": "user", "content": "fix the auth bug"},
            {"role": "assistant", "content": "Done, fixed the race condition."},
        ]
        assert await consolidator.archive(conversation) is None
        history = store.read_unprocessed_history(since_cursor=0)
        assert len(history) == 1
        entry = history[0]["content"]
        # The raw dump must carry the original messages, not the error blob.
        assert "[RAW]" in entry
        assert "Error:" not in entry

    async def test_archive_preserves_summary_on_success(self, consolidator, mock_provider, store):
        """Normal LLM response should still produce a proper summary entry."""
        mock_provider.chat_with_retry.return_value = MagicMock(
            content="User fixed a bug in the auth module.",
            finish_reason="stop",
        )
        conversation = [
            {"role": "user", "content": "fix the auth bug"},
            {"role": "assistant", "content": "Done."},
        ]
        summary = await consolidator.archive(conversation)
        assert summary == "User fixed a bug in the auth module."
        history = store.read_unprocessed_history(since_cursor=0)
        assert len(history) == 1
        assert "[RAW]" not in history[0]["content"]
class TestConsolidatorTokenBudget:
    """Token-budget-driven consolidation triggering and chunk boundaries."""

    @staticmethod
    def _fake_session(total, user_indices):
        """Build a mock session with user turns at *user_indices* and
        assistant turns everywhere else, contents "m0" .. f"m{total-1}"."""
        session = MagicMock()
        session.last_consolidated = 0
        session.key = "test:key"
        session.messages = [
            {"role": "user" if i in user_indices else "assistant", "content": f"m{i}"}
            for i in range(total)
        ]
        return session

    async def test_prompt_below_threshold_does_not_consolidate(self, consolidator):
        """No consolidation when tokens are within budget."""
        session = MagicMock()
        session.last_consolidated = 0
        session.messages = [{"role": "user", "content": "hi"}]
        session.key = "test:key"
        consolidator.estimate_session_prompt_tokens = MagicMock(return_value=(100, "tiktoken"))
        consolidator.archive = AsyncMock(return_value=True)
        await consolidator.maybe_consolidate_by_tokens(session)
        consolidator.archive.assert_not_called()

    async def test_large_chunk_archived_without_cap(self, consolidator):
        """Without chunk cap, the full range from pick_consolidation_boundary is archived."""
        consolidator._SAFETY_BUFFER = 0
        session = self._fake_session(70, {0, 50, 61})
        consolidator.estimate_session_prompt_tokens = MagicMock(
            side_effect=[(1200, "tiktoken"), (400, "tiktoken")]
        )
        # The real pick_consolidation_boundary runs here: the user turn at
        # idx=50 satisfies the token budget and becomes the boundary.
        consolidator.archive = AsyncMock(return_value=True)
        await consolidator.maybe_consolidate_by_tokens(session)
        chunk = consolidator.archive.await_args.args[0]
        # The archived chunk starts at the beginning of the session.
        assert chunk[0]["content"] == "m0"
        assert session.last_consolidated > 0

    async def test_boundary_respected_when_no_intermediate_user_turn(self, consolidator):
        """When boundary points past a long tool chain, the full chunk is archived."""
        consolidator._SAFETY_BUFFER = 0
        session = self._fake_session(70, {0, 61})
        consolidator.estimate_session_prompt_tokens = MagicMock(
            side_effect=[(1200, "tiktoken"), (400, "tiktoken")]
        )
        consolidator.archive = AsyncMock(return_value=True)
        await consolidator.maybe_consolidate_by_tokens(session)
        consolidator.archive.assert_awaited_once()
        # The only eligible boundary is the user turn at idx=61.
        assert session.last_consolidated == 61
class TestRawArchiveTruncation:
    """raw_archive() must cap entry size to avoid bloating history.jsonl."""

    def test_raw_archive_truncates_large_content(self, store):
        """Large messages should be truncated to _RAW_ARCHIVE_MAX_CHARS."""
        big = "x" * 50_000
        messages = [{"role": "user", "content": big}]
        store.raw_archive(messages)
        entries = store.read_unprocessed_history(since_cursor=0)
        assert len(entries) == 1
        # Pin the cap to the actual constant instead of the loose < 50K
        # bound (the import was previously unused); allow a small slack
        # for the "[RAW]"/role framing around the payload.
        assert len(entries[0]["content"]) <= _RAW_ARCHIVE_MAX_CHARS + 1024
        assert "[RAW]" in entries[0]["content"]

    def test_raw_archive_preserves_small_content(self, store):
        """Small messages should not be truncated."""
        messages = [{"role": "user", "content": "hello"}]
        store.raw_archive(messages)
        entries = store.read_unprocessed_history(since_cursor=0)
        assert len(entries) == 1
        assert "hello" in entries[0]["content"]

    def test_raw_archive_custom_max_chars(self, store):
        """max_chars parameter should override the default limit."""
        messages = [{"role": "user", "content": "a" * 200}]
        store.raw_archive(messages, max_chars=100)
        entries = store.read_unprocessed_history(since_cursor=0)
        assert len(entries[0]["content"]) < 200
class TestArchiveTruncation:
    """archive() must truncate formatted text before sending to consolidation LLM."""

    async def test_archive_truncates_large_formatted_text(self, consolidator, mock_provider, store):
        """Large formatted text should be truncated to token budget before LLM call."""
        # context_window_tokens=1000, max_completion_tokens=100, _SAFETY_BUFFER=1024
        # budget = 1000 - 100 - 1024 = -124 → fallback via truncate_text(budget*4)
        big_messages = [{"role": "user", "content": "x" * 100_000}]
        mock_provider.chat_with_retry.return_value = MagicMock(
            content="Summary of large input.", finish_reason="stop"
        )
        await consolidator.archive(big_messages)
        call_args = mock_provider.chat_with_retry.call_args
        user_content = call_args.kwargs["messages"][1]["content"]
        # Should be significantly shorter than 100K
        assert len(user_content) < 50_000

    async def test_archive_truncates_with_small_token_budget(self, consolidator, mock_provider, store):
        """Small context window: the budget is still negative
        (500 - 100 - 1024), so truncation takes the same char-based
        fallback path as above.

        (The docstring previously claimed the tokenizer-count path is
        used, contradicting the budget arithmetic below.)
        """
        consolidator.context_window_tokens = 500
        big_messages = [{"role": "user", "content": "word " * 50_000}]
        mock_provider.chat_with_retry.return_value = MagicMock(
            content="Summary.", finish_reason="stop"
        )
        await consolidator.archive(big_messages)
        sent_messages = mock_provider.chat_with_retry.call_args.kwargs["messages"]
        user_content = sent_messages[1]["content"]
        # budget = 500 - 100 - 1024 = negative, fallback char-based.
        # Must be strictly shorter than the 250K-char input, i.e. truncated.
        assert len(user_content) < len(big_messages[0]["content"])