nanobot/nanobot/utils/evaluator.py
Xubin Ren 14ee7cb121 style: revert unrelated Black-style formatting churn (#3220)
The earlier commits picked up a large amount of Black-style reformatting
(multi-line frozenset / keyword-arg wrapping / docstring blanks / removed
parens) on top of the actual guard fix. @chengyongru flagged it; the
first pass reverted some but not all.

This restores nanobot/providers/base.py, runner.py, heartbeat/service.py,
and utils/evaluator.py to origin/main and reapplies only the guard logic:

  - base.py: add should_execute_tools property
  - runner.py / heartbeat/service.py / utils/evaluator.py: route through it
    + log a warning when has_tool_calls but finish_reason is anomalous

Net diff vs main is now +87/-4 (was +211/-102) — roughly 30 lines of real
logic, which is what the PR is actually about.

Behavior unchanged from previous HEAD; full suite still 2014 passed.

Made-with: Cursor
2026-04-17 20:39:46 +08:00

90 lines
3.0 KiB
Python

"""Post-run evaluation for background tasks (heartbeat & cron).
After the agent executes a background task, this module makes a lightweight
LLM call to decide whether the result warrants notifying the user.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from loguru import logger
from nanobot.utils.prompt_templates import render_template
if TYPE_CHECKING:
from nanobot.providers.base import LLMProvider
_EVALUATE_TOOL = [
{
"type": "function",
"function": {
"name": "evaluate_notification",
"description": "Decide whether the user should be notified about this background task result.",
"parameters": {
"type": "object",
"properties": {
"should_notify": {
"type": "boolean",
"description": "true = result contains actionable/important info the user should see; false = routine or empty, safe to suppress",
},
"reason": {
"type": "string",
"description": "One-sentence reason for the decision",
},
},
"required": ["should_notify"],
},
},
}
]
async def evaluate_response(
    response: str,
    task_context: str,
    provider: LLMProvider,
    model: str,
) -> bool:
    """Decide whether a background-task result should be delivered to the user.

    Sends a single tool-call request (offering ``_EVALUATE_TOOL``) through
    ``provider.chat_with_retry`` and reads the ``should_notify`` argument of
    the first tool call in the reply.

    The function fails open: it returns ``True`` (notify) whenever a usable
    verdict cannot be obtained, so that important messages are never silently
    dropped. That covers:

    * the reply's ``should_execute_tools`` being falsy (with or without tool
      calls present),
    * a ``should_notify`` value that is missing or is not a real boolean
      (e.g. ``null``, ``0``, ``""``, or a string such as ``"false"``),
    * any exception raised while rendering prompts, calling the provider, or
      reading the reply.

    Args:
        response: Output of the background task; rendered into the user
            prompt as ``response``.
        task_context: Description of the task; rendered into the user prompt
            as ``task_context``.
        provider: LLM provider whose ``chat_with_retry`` performs the call.
        model: Model identifier forwarded to the provider unchanged.

    Returns:
        ``False`` only when the tool call carries an explicit boolean
        ``should_notify`` equal to ``False``; ``True`` in every other case.
    """
    try:
        system_prompt = render_template("agent/evaluator.md", part="system")
        user_prompt = render_template(
            "agent/evaluator.md",
            part="user",
            task_context=task_context,
            response=response,
        )
        llm_response = await provider.chat_with_retry(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            tools=_EVALUATE_TOOL,
            model=model,
            max_tokens=256,
            temperature=0.0,
        )

        if not llm_response.should_execute_tools:
            # Distinguish "tool calls present but not to be executed" from
            # "no tool call at all" in the log; both outcomes notify.
            if llm_response.has_tool_calls:
                logger.warning(
                    "evaluate_response: ignoring tool calls under finish_reason='{}', defaulting to notify",
                    llm_response.finish_reason,
                )
            else:
                logger.warning("evaluate_response: no tool call returned, defaulting to notify")
            return True

        args = llm_response.tool_calls[0].arguments
        should_notify = args.get("should_notify", True)
        reason = args.get("reason", "")

        if not isinstance(should_notify, bool):
            # A plain bool() coercion would turn null/0/""/[] into False and
            # suppress the notification; a malformed verdict is treated as a
            # failed evaluation instead, which means notify.
            logger.warning(
                "evaluate_response: non-boolean should_notify={!r}, defaulting to notify",
                should_notify,
            )
            return True

        logger.info("evaluate_response: should_notify={}, reason={}", should_notify, reason)
        return should_notify
    except Exception:
        # Deliberate catch-all boundary: evaluation is best-effort and must
        # never prevent a result from reaching the user.
        logger.exception("evaluate_response failed, defaulting to notify")
        return True