fix(image): prevent duplicate delivery and replay artifacts

This commit is contained in:
Xubin Ren 2026-05-09 05:45:13 +00:00
parent 4d168c571c
commit 3231aaf9ee
9 changed files with 128 additions and 32 deletions

View File

@ -13,12 +13,24 @@ from nanobot.config.paths import get_workspace_path
@tool_parameters(
tool_parameters_schema(
content=StringSchema("The message content to send"),
channel=StringSchema("Optional: target channel (telegram, discord, etc.)"),
chat_id=StringSchema("Optional: target chat/user ID"),
content=StringSchema(
"Message content for proactive or cross-channel delivery. "
"Do not use this for a normal reply in the current chat."
),
channel=StringSchema(
"Optional target channel for cross-channel/proactive delivery. "
"Do not set this to the current runtime channel for a normal reply."
),
chat_id=StringSchema(
"Optional target chat/user ID for cross-channel/proactive delivery. "
"Do not set this to the current runtime chat for a normal reply."
),
media=ArraySchema(
StringSchema(""),
description="Optional: list of file paths to attach (images, video, audio, documents)",
description=(
"Optional list of existing file paths to attach for proactive or cross-channel delivery. "
"Do not use this to resend generate_image outputs in the current chat."
),
),
buttons=ArraySchema(
ArraySchema(StringSchema("Button label")),
@ -100,9 +112,14 @@ class MessageTool(Tool):
@property
def description(self) -> str:
return (
"Send a message to the user, optionally with file attachments. "
"This is the ONLY way to deliver files (images, documents, audio, video) to the user. "
"Use the 'media' parameter with file paths to attach files. "
"Proactively send a message to a user/channel, optionally with file attachments. "
"Use this for reminders, cross-channel delivery, or explicit proactive sends. "
"Do not use this for the normal reply in the current chat: answer naturally instead. "
"If channel/chat_id would target the current runtime conversation, do not call this tool "
"unless the user explicitly asked you to proactively send an existing file attachment. "
"When generate_image creates images in the current chat, the final assistant reply "
"automatically attaches them; do not call message just to announce or resend them. "
"For proactive attachment delivery, use the 'media' parameter with file paths. "
"Do NOT use read_file to send files — that only reads content for your own analysis."
)

View File

@ -2,6 +2,7 @@
import json
import os
import re
import shutil
from contextlib import suppress
from dataclasses import dataclass, field
@ -21,6 +22,25 @@ from nanobot.utils.helpers import (
)
FILE_MAX_MESSAGES = 2000
_MESSAGE_TIME_PREFIX_RE = re.compile(r"^\[Message Time: [^\]]+\]\n?")
_LOCAL_IMAGE_BREADCRUMB_RE = re.compile(r"^\[image: (?:/|~)[^\]]+\]\s*$")
_TOOL_CALL_ECHO_RE = re.compile(r'^\s*(?:generate_image|message)\([^)]*\)\s*$')
def _sanitize_assistant_replay_text(content: str) -> str:
"""Remove internal replay artifacts that the model may have copied before.
These strings are useful as runtime/session metadata, but when they appear
in assistant examples they become demonstrations for the model to repeat.
"""
content = _MESSAGE_TIME_PREFIX_RE.sub("", content, count=1)
lines = [
line
for line in content.splitlines()
if not _LOCAL_IMAGE_BREADCRUMB_RE.match(line)
and not _TOOL_CALL_ECHO_RE.match(line)
]
return "\n".join(lines).strip()
@dataclass
@ -41,22 +61,15 @@ class Session:
Annotating *every* assistant turn trains the model (via in-context
demonstrations) to start its own replies with the same
``[Message Time: ...]`` prefix, which leaks metadata back to the user.
We therefore only annotate:
* ``user`` turns needed so the model can pin the conversation in time.
* proactive deliveries (``_channel_delivery=True``) cron / heartbeat
assistant pushes that may sit hours away from the next user reply,
and are too infrequent to act as parroting demonstrations.
We therefore only annotate user turns. User-side stamps are enough to
pin adjacent assistant replies for relative-time reasoning, including
proactive messages the user replies to later.
"""
timestamp = message.get("timestamp")
if not timestamp or not isinstance(content, str):
return content
role = message.get("role")
if role == "user":
pass
elif role == "assistant" and message.get("_channel_delivery"):
pass
else:
if role != "user":
return content
return f"[Message Time: {timestamp}]\n{content}"
@ -105,19 +118,25 @@ class Session:
out: list[dict[str, Any]] = []
for message in sliced:
content = message.get("content", "")
role = message.get("role")
if role == "assistant" and isinstance(content, str):
content = _sanitize_assistant_replay_text(content)
# Synthesize an ``[image: path]`` breadcrumb from the persisted
# ``media`` kwarg so LLM replay still sees *something* where the
# image used to be. Without this, an image-only user turn
# replays as an empty user message — the assistant's reply then
# looks like it's responding to nothing.
media = message.get("media")
if isinstance(media, list) and media and isinstance(content, str):
if role == "user" and isinstance(media, list) and media and isinstance(content, str):
breadcrumbs = "\n".join(
image_placeholder_text(p) for p in media if isinstance(p, str) and p
)
content = f"{content}\n{breadcrumbs}" if content else breadcrumbs
if include_timestamps:
content = self._annotate_message_time(message, content)
if role == "assistant" and isinstance(content, str) and not content.strip():
if not any(key in message for key in ("tool_calls", "reasoning_content", "thinking_blocks")):
continue
entry: dict[str, Any] = {"role": message["role"], "content": content}
for key in ("tool_calls", "tool_call_id", "name", "reasoning_content", "thinking_blocks"):
if key in message:

View File

@ -15,6 +15,7 @@ If the `generate_image` tool is not available in the current tool list, tell the
- Image editing: pass the saved artifact path or user image path in `reference_images`.
- Iterative edits in the same conversation: prefer the most recent generated image artifact if the user says things like "make it brighter", "change the background", or "try another version".
- Ambiguous edits: ask a short clarifying question if multiple recent images could be the target.
- In the current chat, do not call `message` just to announce or resend generated images. The runtime attaches images from `generate_image` to the final assistant reply automatically.
## Prompt Rules
@ -39,9 +40,11 @@ In normal user-facing replies, do not expose local filesystem paths. Keep the re
For follow-up edits, pass the prior artifact `path` to `reference_images`. If the user provides a new uploaded image, use that path as the reference instead.
Do not include internal replay markers such as `[Message Time: ...]`, `[image: /local/path]`, `generate_image(...)`, or `message(...)` in user-facing replies.
## Provider Notes
Do not ask users to paste API keys into chat. If configuration is needed, describe the fields and remind them to restart the gateway after changing config.
Do not ask users to paste API keys into chat. If configuration is needed, describe the fields; LLM provider and BYOK changes are hot-reloaded for new turns.
For OpenRouter, the image tool expects:

View File

@ -28,5 +28,7 @@ Output is rendered in a terminal. Avoid markdown headings and tables. Use plain
- On broad searches, use `grep(output_mode="count")` to scope before requesting full content.
{% include 'agent/_snippets/untrusted_content.md' %}
Reply directly with text for conversations. Only use the 'message' tool to send to a specific chat channel.
IMPORTANT: To send files (images, video, audio, documents) to the user, you MUST call the 'message' tool with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Examples: message(content="Here is the image", media=["/path/to/file.png"]) or message(content="Here is the video", media=["/path/to/video.mp4"])
Reply directly with text for the current conversation. Do not use the 'message' tool for normal replies in the current chat.
When you need to call tools before answering, do not include the final user-visible answer in the same assistant message as the tool calls. Wait for the tool results, then answer once.
Use the 'message' tool only for proactive sends, cross-channel delivery, or explicitly sending existing local files as attachments. When a tool such as 'generate_image' creates user-visible media, the runtime attaches those artifacts to the final assistant reply automatically, so do not call 'message' just to announce or resend them.
To send an existing local file that was not automatically attached by another tool, call 'message' with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Example: message(content="Here is the document", channel="telegram", chat_id="...", media=["/path/to/file.pdf"])

View File

@ -115,7 +115,8 @@ def generated_image_tool_result(artifacts: list[dict[str, Any]]) -> str:
"artifacts": artifacts,
"next_step": (
"Use these artifact paths as reference_images for follow-up edits. "
"Mention the image id/path to the user; do not paste base64."
"For the current chat, reply naturally; the runtime attaches generated images automatically. "
"Do not call message just to announce or resend them. Keep raw paths internal unless the user asks for debug details."
),
},
ensure_ascii=False,

View File

@ -289,6 +289,18 @@ def test_build_messages_passes_channel_to_system_prompt(tmp_path) -> None:
assert "messaging app" in system
def test_system_prompt_keeps_message_tool_out_of_current_chat_replies(tmp_path) -> None:
workspace = _make_workspace(tmp_path)
builder = ContextBuilder(workspace)
prompt = builder.build_system_prompt(channel="slack")
assert "Do not use the 'message' tool for normal replies in the current chat" in prompt
assert "the runtime attaches those artifacts to the final assistant reply automatically" in prompt
assert "do not call 'message' just to announce or resend them" in prompt
assert "Wait for the tool results, then answer once" in prompt
def test_subagent_result_does_not_create_consecutive_assistant_messages(tmp_path) -> None:
workspace = _make_workspace(tmp_path)
builder = ContextBuilder(workspace)

View File

@ -245,14 +245,8 @@ def test_get_history_annotates_user_turns_but_not_assistant_turns():
]
def test_get_history_annotates_proactive_assistant_deliveries_with_timestamps():
"""Cron / heartbeat assistant pushes still carry a timestamp prefix.
These proactive deliveries can sit hours away from the next user reply,
so the model needs to know when they fired. They are rare enough that
they don't act as in-context demonstrations encouraging the model to
prefix its own normal replies with ``[Message Time: ...]``.
"""
def test_get_history_does_not_annotate_proactive_assistant_deliveries_with_timestamps():
"""Assistant-side timestamp examples can leak back into future replies."""
session = Session(key="test:proactive-timestamps")
session.messages.append({
"role": "assistant",
@ -271,7 +265,7 @@ def test_get_history_annotates_proactive_assistant_deliveries_with_timestamps():
assert history == [
{
"role": "assistant",
"content": "[Message Time: 2026-04-26T15:00:00]\n记得喝水",
"content": "记得喝水",
},
{
"role": "user",
@ -370,6 +364,41 @@ def test_get_history_ignores_media_kwarg_on_non_user_rows():
assert history[0]["content"] == [{"type": "text", "text": "structured"}]
def test_get_history_does_not_paste_assistant_media_paths_into_replay():
session = Session(key="test:assistant-media")
session.messages.append(
{
"role": "assistant",
"content": "来了 🎨",
"media": ["/home/user/.nanobot/media/generated/img_abc.png"],
}
)
history = session.get_history(max_messages=500)
assert history == [{"role": "assistant", "content": "来了 🎨"}]
def test_get_history_sanitizes_existing_assistant_replay_artifacts():
session = Session(key="test:polluted-assistant")
session.messages.append(
{
"role": "assistant",
"content": (
"[Message Time: 2026-05-09 00:33:48]\n"
"来了 🎨\n"
"[image: /home/user/.nanobot/media/generated/img_old.png]\n\n"
"generate_image(\"16:9\")\n"
"message(\"来了 🎨\")"
),
}
)
history = session.get_history(max_messages=500, include_timestamps=True)
assert history == [{"role": "assistant", "content": "来了 🎨"}]
def test_get_history_respects_max_tokens(monkeypatch):
session = Session(key="test:token-cap")
session.messages.extend(

View File

@ -166,3 +166,13 @@ class TestMessageToolTurnTracking:
tool._sent_in_turn = True
tool.start_turn()
assert not tool._sent_in_turn
def test_schema_discourages_current_chat_replies(self) -> None:
tool = MessageTool()
assert "Do not use this for the normal reply in the current chat" in tool.description
assert "generate_image creates images in the current chat" in tool.description
assert (
"Do not use this for a normal reply in the current chat"
in tool.parameters["properties"]["content"]["description"]
)

View File

@ -75,6 +75,7 @@ def test_generated_image_paths_from_tool_results() -> None:
{"id": "img_2", "path": "/tmp/two.png"},
]
)
payload = json.loads(result)
assert generated_image_paths_from_messages(
[
@ -82,3 +83,5 @@ def test_generated_image_paths_from_tool_results() -> None:
{"role": "tool", "name": "other", "content": result},
]
) == ["/tmp/one.png", "/tmp/two.png"]
assert "runtime attaches generated images automatically" in payload["next_step"]
assert "Do not call message" in payload["next_step"]