mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-20 00:22:31 +00:00
fix(image): prevent duplicate delivery and replay artifacts
This commit is contained in:
parent
4d168c571c
commit
3231aaf9ee
@ -13,12 +13,24 @@ from nanobot.config.paths import get_workspace_path
|
|||||||
|
|
||||||
@tool_parameters(
|
@tool_parameters(
|
||||||
tool_parameters_schema(
|
tool_parameters_schema(
|
||||||
content=StringSchema("The message content to send"),
|
content=StringSchema(
|
||||||
channel=StringSchema("Optional: target channel (telegram, discord, etc.)"),
|
"Message content for proactive or cross-channel delivery. "
|
||||||
chat_id=StringSchema("Optional: target chat/user ID"),
|
"Do not use this for a normal reply in the current chat."
|
||||||
|
),
|
||||||
|
channel=StringSchema(
|
||||||
|
"Optional target channel for cross-channel/proactive delivery. "
|
||||||
|
"Do not set this to the current runtime channel for a normal reply."
|
||||||
|
),
|
||||||
|
chat_id=StringSchema(
|
||||||
|
"Optional target chat/user ID for cross-channel/proactive delivery. "
|
||||||
|
"Do not set this to the current runtime chat for a normal reply."
|
||||||
|
),
|
||||||
media=ArraySchema(
|
media=ArraySchema(
|
||||||
StringSchema(""),
|
StringSchema(""),
|
||||||
description="Optional: list of file paths to attach (images, video, audio, documents)",
|
description=(
|
||||||
|
"Optional list of existing file paths to attach for proactive or cross-channel delivery. "
|
||||||
|
"Do not use this to resend generate_image outputs in the current chat."
|
||||||
|
),
|
||||||
),
|
),
|
||||||
buttons=ArraySchema(
|
buttons=ArraySchema(
|
||||||
ArraySchema(StringSchema("Button label")),
|
ArraySchema(StringSchema("Button label")),
|
||||||
@ -100,9 +112,14 @@ class MessageTool(Tool):
|
|||||||
@property
|
@property
|
||||||
def description(self) -> str:
|
def description(self) -> str:
|
||||||
return (
|
return (
|
||||||
"Send a message to the user, optionally with file attachments. "
|
"Proactively send a message to a user/channel, optionally with file attachments. "
|
||||||
"This is the ONLY way to deliver files (images, documents, audio, video) to the user. "
|
"Use this for reminders, cross-channel delivery, or explicit proactive sends. "
|
||||||
"Use the 'media' parameter with file paths to attach files. "
|
"Do not use this for the normal reply in the current chat: answer naturally instead. "
|
||||||
|
"If channel/chat_id would target the current runtime conversation, do not call this tool "
|
||||||
|
"unless the user explicitly asked you to proactively send an existing file attachment. "
|
||||||
|
"When generate_image creates images in the current chat, the final assistant reply "
|
||||||
|
"automatically attaches them; do not call message just to announce or resend them. "
|
||||||
|
"For proactive attachment delivery, use the 'media' parameter with file paths. "
|
||||||
"Do NOT use read_file to send files — that only reads content for your own analysis."
|
"Do NOT use read_file to send files — that only reads content for your own analysis."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
@ -21,6 +22,25 @@ from nanobot.utils.helpers import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
FILE_MAX_MESSAGES = 2000
|
FILE_MAX_MESSAGES = 2000
|
||||||
|
_MESSAGE_TIME_PREFIX_RE = re.compile(r"^\[Message Time: [^\]]+\]\n?")
|
||||||
|
_LOCAL_IMAGE_BREADCRUMB_RE = re.compile(r"^\[image: (?:/|~)[^\]]+\]\s*$")
|
||||||
|
_TOOL_CALL_ECHO_RE = re.compile(r'^\s*(?:generate_image|message)\([^)]*\)\s*$')
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_assistant_replay_text(content: str) -> str:
|
||||||
|
"""Remove internal replay artifacts that the model may have copied before.
|
||||||
|
|
||||||
|
These strings are useful as runtime/session metadata, but when they appear
|
||||||
|
in assistant examples they become demonstrations for the model to repeat.
|
||||||
|
"""
|
||||||
|
content = _MESSAGE_TIME_PREFIX_RE.sub("", content, count=1)
|
||||||
|
lines = [
|
||||||
|
line
|
||||||
|
for line in content.splitlines()
|
||||||
|
if not _LOCAL_IMAGE_BREADCRUMB_RE.match(line)
|
||||||
|
and not _TOOL_CALL_ECHO_RE.match(line)
|
||||||
|
]
|
||||||
|
return "\n".join(lines).strip()
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -41,22 +61,15 @@ class Session:
|
|||||||
Annotating *every* assistant turn trains the model (via in-context
|
Annotating *every* assistant turn trains the model (via in-context
|
||||||
demonstrations) to start its own replies with the same
|
demonstrations) to start its own replies with the same
|
||||||
``[Message Time: ...]`` prefix, which leaks metadata back to the user.
|
``[Message Time: ...]`` prefix, which leaks metadata back to the user.
|
||||||
We therefore only annotate:
|
We therefore only annotate user turns. User-side stamps are enough to
|
||||||
|
pin adjacent assistant replies for relative-time reasoning, including
|
||||||
* ``user`` turns — needed so the model can pin the conversation in time.
|
proactive messages the user replies to later.
|
||||||
* proactive deliveries (``_channel_delivery=True``) — cron / heartbeat
|
|
||||||
assistant pushes that may sit hours away from the next user reply,
|
|
||||||
and are too infrequent to act as parroting demonstrations.
|
|
||||||
"""
|
"""
|
||||||
timestamp = message.get("timestamp")
|
timestamp = message.get("timestamp")
|
||||||
if not timestamp or not isinstance(content, str):
|
if not timestamp or not isinstance(content, str):
|
||||||
return content
|
return content
|
||||||
role = message.get("role")
|
role = message.get("role")
|
||||||
if role == "user":
|
if role != "user":
|
||||||
pass
|
|
||||||
elif role == "assistant" and message.get("_channel_delivery"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return content
|
return content
|
||||||
return f"[Message Time: {timestamp}]\n{content}"
|
return f"[Message Time: {timestamp}]\n{content}"
|
||||||
|
|
||||||
@ -105,19 +118,25 @@ class Session:
|
|||||||
out: list[dict[str, Any]] = []
|
out: list[dict[str, Any]] = []
|
||||||
for message in sliced:
|
for message in sliced:
|
||||||
content = message.get("content", "")
|
content = message.get("content", "")
|
||||||
|
role = message.get("role")
|
||||||
|
if role == "assistant" and isinstance(content, str):
|
||||||
|
content = _sanitize_assistant_replay_text(content)
|
||||||
# Synthesize an ``[image: path]`` breadcrumb from the persisted
|
# Synthesize an ``[image: path]`` breadcrumb from the persisted
|
||||||
# ``media`` kwarg so LLM replay still sees *something* where the
|
# ``media`` kwarg so LLM replay still sees *something* where the
|
||||||
# image used to be. Without this, an image-only user turn
|
# image used to be. Without this, an image-only user turn
|
||||||
# replays as an empty user message — the assistant's reply then
|
# replays as an empty user message — the assistant's reply then
|
||||||
# looks like it's responding to nothing.
|
# looks like it's responding to nothing.
|
||||||
media = message.get("media")
|
media = message.get("media")
|
||||||
if isinstance(media, list) and media and isinstance(content, str):
|
if role == "user" and isinstance(media, list) and media and isinstance(content, str):
|
||||||
breadcrumbs = "\n".join(
|
breadcrumbs = "\n".join(
|
||||||
image_placeholder_text(p) for p in media if isinstance(p, str) and p
|
image_placeholder_text(p) for p in media if isinstance(p, str) and p
|
||||||
)
|
)
|
||||||
content = f"{content}\n{breadcrumbs}" if content else breadcrumbs
|
content = f"{content}\n{breadcrumbs}" if content else breadcrumbs
|
||||||
if include_timestamps:
|
if include_timestamps:
|
||||||
content = self._annotate_message_time(message, content)
|
content = self._annotate_message_time(message, content)
|
||||||
|
if role == "assistant" and isinstance(content, str) and not content.strip():
|
||||||
|
if not any(key in message for key in ("tool_calls", "reasoning_content", "thinking_blocks")):
|
||||||
|
continue
|
||||||
entry: dict[str, Any] = {"role": message["role"], "content": content}
|
entry: dict[str, Any] = {"role": message["role"], "content": content}
|
||||||
for key in ("tool_calls", "tool_call_id", "name", "reasoning_content", "thinking_blocks"):
|
for key in ("tool_calls", "tool_call_id", "name", "reasoning_content", "thinking_blocks"):
|
||||||
if key in message:
|
if key in message:
|
||||||
|
|||||||
@ -15,6 +15,7 @@ If the `generate_image` tool is not available in the current tool list, tell the
|
|||||||
- Image editing: pass the saved artifact path or user image path in `reference_images`.
|
- Image editing: pass the saved artifact path or user image path in `reference_images`.
|
||||||
- Iterative edits in the same conversation: prefer the most recent generated image artifact if the user says things like "make it brighter", "change the background", or "try another version".
|
- Iterative edits in the same conversation: prefer the most recent generated image artifact if the user says things like "make it brighter", "change the background", or "try another version".
|
||||||
- Ambiguous edits: ask a short clarifying question if multiple recent images could be the target.
|
- Ambiguous edits: ask a short clarifying question if multiple recent images could be the target.
|
||||||
|
- In the current chat, do not call `message` just to announce or resend generated images. The runtime attaches images from `generate_image` to the final assistant reply automatically.
|
||||||
|
|
||||||
## Prompt Rules
|
## Prompt Rules
|
||||||
|
|
||||||
@ -39,9 +40,11 @@ In normal user-facing replies, do not expose local filesystem paths. Keep the re
|
|||||||
|
|
||||||
For follow-up edits, pass the prior artifact `path` to `reference_images`. If the user provides a new uploaded image, use that path as the reference instead.
|
For follow-up edits, pass the prior artifact `path` to `reference_images`. If the user provides a new uploaded image, use that path as the reference instead.
|
||||||
|
|
||||||
|
Do not include internal replay markers such as `[Message Time: ...]`, `[image: /local/path]`, `generate_image(...)`, or `message(...)` in user-facing replies.
|
||||||
|
|
||||||
## Provider Notes
|
## Provider Notes
|
||||||
|
|
||||||
Do not ask users to paste API keys into chat. If configuration is needed, describe the fields and remind them to restart the gateway after changing config.
|
Do not ask users to paste API keys into chat. If configuration is needed, describe the fields; LLM provider and BYOK changes are hot-reloaded for new turns.
|
||||||
|
|
||||||
For OpenRouter, the image tool expects:
|
For OpenRouter, the image tool expects:
|
||||||
|
|
||||||
|
|||||||
@ -28,5 +28,7 @@ Output is rendered in a terminal. Avoid markdown headings and tables. Use plain
|
|||||||
- On broad searches, use `grep(output_mode="count")` to scope before requesting full content.
|
- On broad searches, use `grep(output_mode="count")` to scope before requesting full content.
|
||||||
{% include 'agent/_snippets/untrusted_content.md' %}
|
{% include 'agent/_snippets/untrusted_content.md' %}
|
||||||
|
|
||||||
Reply directly with text for conversations. Only use the 'message' tool to send to a specific chat channel.
|
Reply directly with text for the current conversation. Do not use the 'message' tool for normal replies in the current chat.
|
||||||
IMPORTANT: To send files (images, video, audio, documents) to the user, you MUST call the 'message' tool with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Examples: message(content="Here is the image", media=["/path/to/file.png"]) or message(content="Here is the video", media=["/path/to/video.mp4"])
|
When you need to call tools before answering, do not include the final user-visible answer in the same assistant message as the tool calls. Wait for the tool results, then answer once.
|
||||||
|
Use the 'message' tool only for proactive sends, cross-channel delivery, or explicitly sending existing local files as attachments. When a tool such as 'generate_image' creates user-visible media, the runtime attaches those artifacts to the final assistant reply automatically, so do not call 'message' just to announce or resend them.
|
||||||
|
To send an existing local file that was not automatically attached by another tool, call 'message' with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Example: message(content="Here is the document", channel="telegram", chat_id="...", media=["/path/to/file.pdf"])
|
||||||
|
|||||||
@ -115,7 +115,8 @@ def generated_image_tool_result(artifacts: list[dict[str, Any]]) -> str:
|
|||||||
"artifacts": artifacts,
|
"artifacts": artifacts,
|
||||||
"next_step": (
|
"next_step": (
|
||||||
"Use these artifact paths as reference_images for follow-up edits. "
|
"Use these artifact paths as reference_images for follow-up edits. "
|
||||||
"Mention the image id/path to the user; do not paste base64."
|
"For the current chat, reply naturally; the runtime attaches generated images automatically. "
|
||||||
|
"Do not call message just to announce or resend them. Keep raw paths internal unless the user asks for debug details."
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
ensure_ascii=False,
|
ensure_ascii=False,
|
||||||
|
|||||||
@ -289,6 +289,18 @@ def test_build_messages_passes_channel_to_system_prompt(tmp_path) -> None:
|
|||||||
assert "messaging app" in system
|
assert "messaging app" in system
|
||||||
|
|
||||||
|
|
||||||
|
def test_system_prompt_keeps_message_tool_out_of_current_chat_replies(tmp_path) -> None:
|
||||||
|
workspace = _make_workspace(tmp_path)
|
||||||
|
builder = ContextBuilder(workspace)
|
||||||
|
|
||||||
|
prompt = builder.build_system_prompt(channel="slack")
|
||||||
|
|
||||||
|
assert "Do not use the 'message' tool for normal replies in the current chat" in prompt
|
||||||
|
assert "the runtime attaches those artifacts to the final assistant reply automatically" in prompt
|
||||||
|
assert "do not call 'message' just to announce or resend them" in prompt
|
||||||
|
assert "Wait for the tool results, then answer once" in prompt
|
||||||
|
|
||||||
|
|
||||||
def test_subagent_result_does_not_create_consecutive_assistant_messages(tmp_path) -> None:
|
def test_subagent_result_does_not_create_consecutive_assistant_messages(tmp_path) -> None:
|
||||||
workspace = _make_workspace(tmp_path)
|
workspace = _make_workspace(tmp_path)
|
||||||
builder = ContextBuilder(workspace)
|
builder = ContextBuilder(workspace)
|
||||||
|
|||||||
@ -245,14 +245,8 @@ def test_get_history_annotates_user_turns_but_not_assistant_turns():
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_get_history_annotates_proactive_assistant_deliveries_with_timestamps():
|
def test_get_history_does_not_annotate_proactive_assistant_deliveries_with_timestamps():
|
||||||
"""Cron / heartbeat assistant pushes still carry a timestamp prefix.
|
"""Assistant-side timestamp examples can leak back into future replies."""
|
||||||
|
|
||||||
These proactive deliveries can sit hours away from the next user reply,
|
|
||||||
so the model needs to know when they fired. They are rare enough that
|
|
||||||
they don't act as in-context demonstrations encouraging the model to
|
|
||||||
prefix its own normal replies with ``[Message Time: ...]``.
|
|
||||||
"""
|
|
||||||
session = Session(key="test:proactive-timestamps")
|
session = Session(key="test:proactive-timestamps")
|
||||||
session.messages.append({
|
session.messages.append({
|
||||||
"role": "assistant",
|
"role": "assistant",
|
||||||
@ -271,7 +265,7 @@ def test_get_history_annotates_proactive_assistant_deliveries_with_timestamps():
|
|||||||
assert history == [
|
assert history == [
|
||||||
{
|
{
|
||||||
"role": "assistant",
|
"role": "assistant",
|
||||||
"content": "[Message Time: 2026-04-26T15:00:00]\n记得喝水",
|
"content": "记得喝水",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@ -370,6 +364,41 @@ def test_get_history_ignores_media_kwarg_on_non_user_rows():
|
|||||||
assert history[0]["content"] == [{"type": "text", "text": "structured"}]
|
assert history[0]["content"] == [{"type": "text", "text": "structured"}]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_history_does_not_paste_assistant_media_paths_into_replay():
|
||||||
|
session = Session(key="test:assistant-media")
|
||||||
|
session.messages.append(
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "来了 🎨",
|
||||||
|
"media": ["/home/user/.nanobot/media/generated/img_abc.png"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
history = session.get_history(max_messages=500)
|
||||||
|
|
||||||
|
assert history == [{"role": "assistant", "content": "来了 🎨"}]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_history_sanitizes_existing_assistant_replay_artifacts():
|
||||||
|
session = Session(key="test:polluted-assistant")
|
||||||
|
session.messages.append(
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": (
|
||||||
|
"[Message Time: 2026-05-09 00:33:48]\n"
|
||||||
|
"来了 🎨\n"
|
||||||
|
"[image: /home/user/.nanobot/media/generated/img_old.png]\n\n"
|
||||||
|
"generate_image(\"16:9\")\n"
|
||||||
|
"message(\"来了 🎨\")"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
history = session.get_history(max_messages=500, include_timestamps=True)
|
||||||
|
|
||||||
|
assert history == [{"role": "assistant", "content": "来了 🎨"}]
|
||||||
|
|
||||||
|
|
||||||
def test_get_history_respects_max_tokens(monkeypatch):
|
def test_get_history_respects_max_tokens(monkeypatch):
|
||||||
session = Session(key="test:token-cap")
|
session = Session(key="test:token-cap")
|
||||||
session.messages.extend(
|
session.messages.extend(
|
||||||
|
|||||||
@ -166,3 +166,13 @@ class TestMessageToolTurnTracking:
|
|||||||
tool._sent_in_turn = True
|
tool._sent_in_turn = True
|
||||||
tool.start_turn()
|
tool.start_turn()
|
||||||
assert not tool._sent_in_turn
|
assert not tool._sent_in_turn
|
||||||
|
|
||||||
|
def test_schema_discourages_current_chat_replies(self) -> None:
|
||||||
|
tool = MessageTool()
|
||||||
|
|
||||||
|
assert "Do not use this for the normal reply in the current chat" in tool.description
|
||||||
|
assert "generate_image creates images in the current chat" in tool.description
|
||||||
|
assert (
|
||||||
|
"Do not use this for a normal reply in the current chat"
|
||||||
|
in tool.parameters["properties"]["content"]["description"]
|
||||||
|
)
|
||||||
|
|||||||
@ -75,6 +75,7 @@ def test_generated_image_paths_from_tool_results() -> None:
|
|||||||
{"id": "img_2", "path": "/tmp/two.png"},
|
{"id": "img_2", "path": "/tmp/two.png"},
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
payload = json.loads(result)
|
||||||
|
|
||||||
assert generated_image_paths_from_messages(
|
assert generated_image_paths_from_messages(
|
||||||
[
|
[
|
||||||
@ -82,3 +83,5 @@ def test_generated_image_paths_from_tool_results() -> None:
|
|||||||
{"role": "tool", "name": "other", "content": result},
|
{"role": "tool", "name": "other", "content": result},
|
||||||
]
|
]
|
||||||
) == ["/tmp/one.png", "/tmp/two.png"]
|
) == ["/tmp/one.png", "/tmp/two.png"]
|
||||||
|
assert "runtime attaches generated images automatically" in payload["next_step"]
|
||||||
|
assert "Do not call message" in payload["next_step"]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user