nanobot/tests/heartbeat/test_heartbeat_deliverability.py
hussein1362 e72c415473 fix(heartbeat): prevent internal reasoning leaks and finalization fallback in delivery
Three failure modes addressed:

1. Model reflects HEARTBEAT.md instructions back as output instead of
   executing them ("HEARTBEAT.md has active tasks listed...")
2. Model narrates decision logic ("Best judgment call: stay quiet")
3. Model produces empty output for silence, runner treats it as failure,
   finalization retry generates "couldn't produce a final answer" which
   gets delivered to the user

Changes:
- Add _is_deliverable() pre-filter in HeartbeatService._tick() that catches
  finalization fallback messages and leaked reasoning patterns before they
  reach the evaluator
- Wrap Phase 2 task input with a delivery-awareness preamble telling the
  model its output goes directly to the user's messaging app
- Add meta-reasoning suppression criterion to evaluator template

No changes to agent/loop.py, runner.py, providers, or config schema.
2026-04-27 18:14:13 +08:00

231 lines
7.5 KiB
Python

"""Tests for HeartbeatService._is_deliverable and _tick suppression."""
import pytest
from nanobot.heartbeat.service import HeartbeatService
from nanobot.providers.base import LLMResponse, ToolCallRequest
# ---------------------------------------------------------------------------
# _is_deliverable unit tests
# ---------------------------------------------------------------------------
class TestIsDeliverable:
    """Unit checks for ``HeartbeatService._is_deliverable``.

    Each case feeds one sample response to the filter and asserts whether it
    is accepted (deliverable) or rejected (blocked).
    """

    def test_normal_report_is_deliverable(self):
        """An ordinary status report is accepted."""
        report = "2 new emails — invoice from Zain, meeting rescheduled to 3pm."
        assert HeartbeatService._is_deliverable(report)

    def test_short_dismissal_is_deliverable(self):
        """A terse "nothing to report" reply is accepted."""
        dismissal = "All clear."
        assert HeartbeatService._is_deliverable(dismissal)

    def test_finalization_fallback_blocked(self):
        """The finalization-fallback message is rejected."""
        fallback = (
            "I completed the tool steps but couldn't produce a final answer. "
            "Please try again or narrow the task."
        )
        assert not HeartbeatService._is_deliverable(fallback)

    def test_leaked_heartbeat_md_reference_blocked(self):
        """A reply that mentions HEARTBEAT.md is rejected."""
        leaked = (
            "Yes — HEARTBEAT.md has active tasks listed. They are: "
            "Check Gmail for important messages, Check Calendar."
        )
        assert not HeartbeatService._is_deliverable(leaked)

    def test_leaked_awareness_md_reference_blocked(self):
        """A reply that mentions AWARENESS.md is rejected."""
        leaked = "I reviewed AWARENESS.md and found no new signals."
        assert not HeartbeatService._is_deliverable(leaked)

    def test_leaked_judgment_call_blocked(self):
        """Narrated "judgment call" reasoning is rejected."""
        leaked = "Best judgment call: stay quiet."
        assert not HeartbeatService._is_deliverable(leaked)

    def test_leaked_decision_logic_blocked(self):
        """Narrated decision logic is rejected."""
        leaked = "Strict HEARTBEAT interpretation. Decision logic says SHORT UPDATE."
        assert not HeartbeatService._is_deliverable(leaked)

    def test_leaked_valid_options_blocked(self):
        """An enumeration of the output options is rejected."""
        leaked = "The valid options are FULL REPORT, SHORT UPDATE, or SILENT."
        assert not HeartbeatService._is_deliverable(leaked)

    def test_leaked_my_instructions_blocked(self):
        """A reply quoting "my instructions" is rejected."""
        leaked = "My instructions say to check Gmail and Calendar."
        assert not HeartbeatService._is_deliverable(leaked)

    def test_leaked_supposed_to_blocked(self):
        """A reply describing what the model is "supposed to" do is rejected."""
        leaked = "I am supposed to scan for urgent emails."
        assert not HeartbeatService._is_deliverable(leaked)

    def test_case_insensitive(self):
        """An upper-cased HEARTBEAT.MD reference is rejected as well."""
        shouted = "HEARTBEAT.MD has tasks listed."
        assert not HeartbeatService._is_deliverable(shouted)

    def test_empty_string_is_deliverable(self):
        """The empty string is accepted without raising.

        Per the original note, empty output is expected to be caught earlier
        and should not reach the filter in practice.
        """
        nothing = ""
        assert HeartbeatService._is_deliverable(nothing)
# ---------------------------------------------------------------------------
# _tick integration: non-deliverable responses never reach evaluator/notify
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_tick_suppresses_finalization_fallback(tmp_path, monkeypatch) -> None:
    """A finalization-fallback reply must be dropped by ``_tick``.

    The execute callback returns the fallback text. After one tick the test
    asserts that neither the patched evaluator nor the notify callback ran.
    """
    from nanobot.providers.base import LLMProvider

    heartbeat_file = tmp_path / "HEARTBEAT.md"
    heartbeat_file.write_text("- [ ] check inbox", encoding="utf-8")

    class StubProvider(LLMProvider):
        """Provider stub whose chat reply always requests a heartbeat run."""

        async def chat(self, **kwargs) -> LLMResponse:
            run_request = ToolCallRequest(
                id="hb_1",
                name="heartbeat",
                arguments={"action": "run", "tasks": "check inbox"},
            )
            return LLMResponse(content="", tool_calls=[run_request])

        def get_default_model(self) -> str:
            return "test-model"

    delivered: list[str] = []
    # One entry per evaluator invocation; must stay empty for this test.
    evaluator_calls: list[tuple] = []

    async def _execute(tasks: str) -> str:
        return (
            "I completed the tool steps but couldn't produce a final answer. "
            "Please try again or narrow the task."
        )

    async def _notify(response: str) -> None:
        delivered.append(response)

    async def _approve_everything(*args, **kwargs):
        evaluator_calls.append((args, kwargs))
        return True

    # The evaluator would approve anything, so only the pre-filter can block.
    monkeypatch.setattr(
        "nanobot.utils.evaluator.evaluate_response", _approve_everything
    )

    service = HeartbeatService(
        workspace=tmp_path,
        provider=StubProvider(),
        model="test-model",
        on_execute=_execute,
        on_notify=_notify,
    )
    await service._tick()

    assert delivered == [], "Finalization fallback should not reach the user"
    assert not evaluator_calls, "Evaluator should not be called for non-deliverable responses"
@pytest.mark.asyncio
async def test_tick_suppresses_leaked_reasoning(tmp_path, monkeypatch) -> None:
    """Leaked internal reasoning should be caught before the evaluator runs.

    The execute callback returns text that references HEARTBEAT.md. After one
    tick the test asserts that nothing was delivered to the user and that the
    patched evaluator was never consulted, which is the same pair of checks
    the finalization-fallback test makes.
    """
    (tmp_path / "HEARTBEAT.md").write_text("- [ ] check status", encoding="utf-8")

    from nanobot.providers.base import LLMProvider

    class StubProvider(LLMProvider):
        """Provider stub whose chat reply always requests a heartbeat run."""

        async def chat(self, **kwargs) -> LLMResponse:
            return LLMResponse(
                content="",
                tool_calls=[
                    ToolCallRequest(
                        id="hb_1",
                        name="heartbeat",
                        arguments={"action": "run", "tasks": "check status"},
                    )
                ],
            )

        def get_default_model(self) -> str:
            return "test-model"

    notified: list[str] = []
    evaluator_called = False

    async def _on_execute(tasks: str) -> str:
        return "HEARTBEAT.md has active tasks listed. They are: Check Gmail."

    async def _on_notify(response: str) -> None:
        notified.append(response)

    async def _eval_always_notify(*a, **kw):
        # Record the call so the test can tell "blocked before the evaluator"
        # apart from "evaluator ran but delivery was skipped".
        nonlocal evaluator_called
        evaluator_called = True
        return True

    monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_always_notify)

    service = HeartbeatService(
        workspace=tmp_path,
        provider=StubProvider(),
        model="test-model",
        on_execute=_on_execute,
        on_notify=_on_notify,
    )
    await service._tick()

    assert notified == [], "Leaked reasoning should not reach the user"
    assert not evaluator_called, "Evaluator should not be called for non-deliverable responses"
@pytest.mark.asyncio
async def test_tick_delivers_normal_report(tmp_path, monkeypatch) -> None:
    """An ordinary report is handed to the notify callback unchanged.

    The evaluator is patched to approve every response, so after one tick the
    notify callback must have received exactly the executed report.
    """
    from nanobot.providers.base import LLMProvider

    heartbeat_file = tmp_path / "HEARTBEAT.md"
    heartbeat_file.write_text("- [ ] check inbox", encoding="utf-8")

    report = "3 new emails — client proposal from Zain, invoice, meeting reminder."

    class StubProvider(LLMProvider):
        """Provider stub whose chat reply always requests a heartbeat run."""

        async def chat(self, **kwargs) -> LLMResponse:
            run_request = ToolCallRequest(
                id="hb_1",
                name="heartbeat",
                arguments={"action": "run", "tasks": "check inbox"},
            )
            return LLMResponse(content="", tool_calls=[run_request])

        def get_default_model(self) -> str:
            return "test-model"

    delivered: list[str] = []

    async def _execute(tasks: str) -> str:
        return report

    async def _notify(response: str) -> None:
        delivered.append(response)

    async def _approve_everything(*args, **kwargs):
        return True

    monkeypatch.setattr(
        "nanobot.utils.evaluator.evaluate_response", _approve_everything
    )

    service = HeartbeatService(
        workspace=tmp_path,
        provider=StubProvider(),
        model="test-model",
        on_execute=_execute,
        on_notify=_notify,
    )
    await service._tick()

    assert delivered == [report]