fix(image): prevent duplicate delivery and replay artifacts

2026-05-19 16:12:30 +00:00 · 2026-05-09 05:45:13 +00:00 · 2026-05-09 05:45:13 +00:00 · 3231aaf9ee
commit 3231aaf9ee
parent 4d168c571c
9 changed files with 128 additions and 32 deletions
--- a/nanobot/agent/tools/message.py
+++ b/nanobot/agent/tools/message.py
@ -13,12 +13,24 @@ from nanobot.config.paths import get_workspace_path

@tool_parameters(
    tool_parameters_schema(
-        content=StringSchema("The message content to send"),
-        channel=StringSchema("Optional: target channel (telegram, discord, etc.)"),
-        chat_id=StringSchema("Optional: target chat/user ID"),
+        content=StringSchema(
+            "Message content for proactive or cross-channel delivery. "
+            "Do not use this for a normal reply in the current chat."
+        ),
+        channel=StringSchema(
+            "Optional target channel for cross-channel/proactive delivery. "
+            "Do not set this to the current runtime channel for a normal reply."
+        ),
+        chat_id=StringSchema(
+            "Optional target chat/user ID for cross-channel/proactive delivery. "
+            "Do not set this to the current runtime chat for a normal reply."
+        ),
        media=ArraySchema(
            StringSchema(""),
-            description="Optional: list of file paths to attach (images, video, audio, documents)",
+            description=(
+                "Optional list of existing file paths to attach for proactive or cross-channel delivery. "
+                "Do not use this to resend generate_image outputs in the current chat."
+            ),
        ),
        buttons=ArraySchema(
            ArraySchema(StringSchema("Button label")),
@ -100,9 +112,14 @@ class MessageTool(Tool):
    @property
    def description(self) -> str:
        return (
-            "Send a message to the user, optionally with file attachments. "
-            "This is the ONLY way to deliver files (images, documents, audio, video) to the user. "
-            "Use the 'media' parameter with file paths to attach files. "
+            "Proactively send a message to a user/channel, optionally with file attachments. "
+            "Use this for reminders, cross-channel delivery, or explicit proactive sends. "
+            "Do not use this for the normal reply in the current chat: answer naturally instead. "
+            "If channel/chat_id would target the current runtime conversation, do not call this tool "
+            "unless the user explicitly asked you to proactively send an existing file attachment. "
+            "When generate_image creates images in the current chat, the final assistant reply "
+            "automatically attaches them; do not call message just to announce or resend them. "
+            "For proactive attachment delivery, use the 'media' parameter with file paths. "
            "Do NOT use read_file to send files — that only reads content for your own analysis."
        )

--- a/nanobot/session/manager.py
+++ b/nanobot/session/manager.py
@ -2,6 +2,7 @@

 import json
 import os
+import re
 import shutil
 from contextlib import suppress
 from dataclasses import dataclass, field
@ -21,6 +22,25 @@ from nanobot.utils.helpers import (
 )

 FILE_MAX_MESSAGES = 2000
+_MESSAGE_TIME_PREFIX_RE = re.compile(r"^\[Message Time: [^\]]+\]\n?")
+_LOCAL_IMAGE_BREADCRUMB_RE = re.compile(r"^\[image: (?:/|~)[^\]]+\]\s*$")
+_TOOL_CALL_ECHO_RE = re.compile(r'^\s*(?:generate_image|message)\([^)]*\)\s*$')
+
+
+def _sanitize_assistant_replay_text(content: str) -> str:
+    """Remove internal replay artifacts that the model may have copied before.
+
+    These strings are useful as runtime/session metadata, but when they appear
+    in assistant examples they become demonstrations for the model to repeat.
+    """
+    content = _MESSAGE_TIME_PREFIX_RE.sub("", content, count=1)
+    lines = [
+        line
+        for line in content.splitlines()
+        if not _LOCAL_IMAGE_BREADCRUMB_RE.match(line)
+        and not _TOOL_CALL_ECHO_RE.match(line)
+    ]
+    return "\n".join(lines).strip()


@dataclass
@ -41,22 +61,15 @@ class Session:
        Annotating *every* assistant turn trains the model (via in-context
        demonstrations) to start its own replies with the same
        ``[Message Time: ...]`` prefix, which leaks metadata back to the user.
-        We therefore only annotate:
-
-        * ``user`` turns — needed so the model can pin the conversation in time.
-        * proactive deliveries (``_channel_delivery=True``) — cron / heartbeat
-          assistant pushes that may sit hours away from the next user reply,
-          and are too infrequent to act as parroting demonstrations.
+        We therefore only annotate user turns. User-side stamps are enough to
+        pin adjacent assistant replies for relative-time reasoning, including
+        proactive messages the user replies to later.
        """
        timestamp = message.get("timestamp")
        if not timestamp or not isinstance(content, str):
            return content
        role = message.get("role")
-        if role == "user":
-            pass
-        elif role == "assistant" and message.get("_channel_delivery"):
-            pass
-        else:
+        if role != "user":
            return content
        return f"[Message Time: {timestamp}]\n{content}"

@ -105,19 +118,25 @@ class Session:
        out: list[dict[str, Any]] = []
        for message in sliced:
            content = message.get("content", "")
+            role = message.get("role")
+            if role == "assistant" and isinstance(content, str):
+                content = _sanitize_assistant_replay_text(content)
            # Synthesize an ``[image: path]`` breadcrumb from the persisted
            # ``media`` kwarg so LLM replay still sees *something* where the
            # image used to be. Without this, an image-only user turn
            # replays as an empty user message — the assistant's reply then
            # looks like it's responding to nothing.
            media = message.get("media")
-            if isinstance(media, list) and media and isinstance(content, str):
+            if role == "user" and isinstance(media, list) and media and isinstance(content, str):
                breadcrumbs = "\n".join(
                    image_placeholder_text(p) for p in media if isinstance(p, str) and p
                )
                content = f"{content}\n{breadcrumbs}" if content else breadcrumbs
            if include_timestamps:
                content = self._annotate_message_time(message, content)
+            if role == "assistant" and isinstance(content, str) and not content.strip():
+                if not any(key in message for key in ("tool_calls", "reasoning_content", "thinking_blocks")):
+                    continue
            entry: dict[str, Any] = {"role": message["role"], "content": content}
            for key in ("tool_calls", "tool_call_id", "name", "reasoning_content", "thinking_blocks"):
                if key in message:
--- a/nanobot/skills/image-generation/SKILL.md
+++ b/nanobot/skills/image-generation/SKILL.md
@ -15,6 +15,7 @@ If the `generate_image` tool is not available in the current tool list, tell the
 - Image editing: pass the saved artifact path or user image path in `reference_images`.
 - Iterative edits in the same conversation: prefer the most recent generated image artifact if the user says things like "make it brighter", "change the background", or "try another version".
 - Ambiguous edits: ask a short clarifying question if multiple recent images could be the target.
+- In the current chat, do not call `message` just to announce or resend generated images. The runtime attaches images from `generate_image` to the final assistant reply automatically.

 ## Prompt Rules

@ -39,9 +40,11 @@ In normal user-facing replies, do not expose local filesystem paths. Keep the re

 For follow-up edits, pass the prior artifact `path` to `reference_images`. If the user provides a new uploaded image, use that path as the reference instead.

+Do not include internal replay markers such as `[Message Time: ...]`, `[image: /local/path]`, `generate_image(...)`, or `message(...)` in user-facing replies.
+
 ## Provider Notes

-Do not ask users to paste API keys into chat. If configuration is needed, describe the fields and remind them to restart the gateway after changing config.
+Do not ask users to paste API keys into chat. If configuration is needed, describe the fields; LLM provider and BYOK changes are hot-reloaded for new turns.

 For OpenRouter, the image tool expects:

--- a/nanobot/templates/agent/identity.md
+++ b/nanobot/templates/agent/identity.md
@ -28,5 +28,7 @@ Output is rendered in a terminal. Avoid markdown headings and tables. Use plain
 - On broad searches, use `grep(output_mode="count")` to scope before requesting full content.
 {% include 'agent/_snippets/untrusted_content.md' %}

-Reply directly with text for conversations. Only use the 'message' tool to send to a specific chat channel.
-IMPORTANT: To send files (images, video, audio, documents) to the user, you MUST call the 'message' tool with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Examples: message(content="Here is the image", media=["/path/to/file.png"]) or message(content="Here is the video", media=["/path/to/video.mp4"])
+Reply directly with text for the current conversation. Do not use the 'message' tool for normal replies in the current chat.
+When you need to call tools before answering, do not include the final user-visible answer in the same assistant message as the tool calls. Wait for the tool results, then answer once.
+Use the 'message' tool only for proactive sends, cross-channel delivery, or explicitly sending existing local files as attachments. When a tool such as 'generate_image' creates user-visible media, the runtime attaches those artifacts to the final assistant reply automatically, so do not call 'message' just to announce or resend them.
+To send an existing local file that was not automatically attached by another tool, call 'message' with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Example: message(content="Here is the document", channel="telegram", chat_id="...", media=["/path/to/file.pdf"])
--- a/nanobot/utils/artifacts.py
+++ b/nanobot/utils/artifacts.py
@ -115,7 +115,8 @@ def generated_image_tool_result(artifacts: list[dict[str, Any]]) -> str:
            "artifacts": artifacts,
            "next_step": (
                "Use these artifact paths as reference_images for follow-up edits. "
-                "Mention the image id/path to the user; do not paste base64."
+                "For the current chat, reply naturally; the runtime attaches generated images automatically. "
+                "Do not call message just to announce or resend them. Keep raw paths internal unless the user asks for debug details."
            ),
        },
        ensure_ascii=False,
--- a/tests/agent/test_context_prompt_cache.py
+++ b/tests/agent/test_context_prompt_cache.py
@ -289,6 +289,18 @@ def test_build_messages_passes_channel_to_system_prompt(tmp_path) -> None:
    assert "messaging app" in system


+def test_system_prompt_keeps_message_tool_out_of_current_chat_replies(tmp_path) -> None:
+    workspace = _make_workspace(tmp_path)
+    builder = ContextBuilder(workspace)
+
+    prompt = builder.build_system_prompt(channel="slack")
+
+    assert "Do not use the 'message' tool for normal replies in the current chat" in prompt
+    assert "the runtime attaches those artifacts to the final assistant reply automatically" in prompt
+    assert "do not call 'message' just to announce or resend them" in prompt
+    assert "Wait for the tool results, then answer once" in prompt
+
+
 def test_subagent_result_does_not_create_consecutive_assistant_messages(tmp_path) -> None:
    workspace = _make_workspace(tmp_path)
    builder = ContextBuilder(workspace)
--- a/tests/agent/test_session_manager_history.py
+++ b/tests/agent/test_session_manager_history.py
@ -245,14 +245,8 @@ def test_get_history_annotates_user_turns_but_not_assistant_turns():
    ]


-def test_get_history_annotates_proactive_assistant_deliveries_with_timestamps():
-    """Cron / heartbeat assistant pushes still carry a timestamp prefix.
-
-    These proactive deliveries can sit hours away from the next user reply,
-    so the model needs to know when they fired. They are rare enough that
-    they don't act as in-context demonstrations encouraging the model to
-    prefix its own normal replies with ``[Message Time: ...]``.
-    """
+def test_get_history_does_not_annotate_proactive_assistant_deliveries_with_timestamps():
+    """Assistant-side timestamp examples can leak back into future replies."""
    session = Session(key="test:proactive-timestamps")
    session.messages.append({
        "role": "assistant",
@ -271,7 +265,7 @@ def test_get_history_annotates_proactive_assistant_deliveries_with_timestamps():
    assert history == [
        {
            "role": "assistant",
-            "content": "[Message Time: 2026-04-26T15:00:00]\n记得喝水",
+            "content": "记得喝水",
        },
        {
            "role": "user",
@ -370,6 +364,41 @@ def test_get_history_ignores_media_kwarg_on_non_user_rows():
    assert history[0]["content"] == [{"type": "text", "text": "structured"}]


+def test_get_history_does_not_paste_assistant_media_paths_into_replay():
+    session = Session(key="test:assistant-media")
+    session.messages.append(
+        {
+            "role": "assistant",
+            "content": "来了 🎨",
+            "media": ["/home/user/.nanobot/media/generated/img_abc.png"],
+        }
+    )
+
+    history = session.get_history(max_messages=500)
+
+    assert history == [{"role": "assistant", "content": "来了 🎨"}]
+
+
+def test_get_history_sanitizes_existing_assistant_replay_artifacts():
+    session = Session(key="test:polluted-assistant")
+    session.messages.append(
+        {
+            "role": "assistant",
+            "content": (
+                "[Message Time: 2026-05-09 00:33:48]\n"
+                "来了 🎨\n"
+                "[image: /home/user/.nanobot/media/generated/img_old.png]\n\n"
+                "generate_image(\"16:9\")\n"
+                "message(\"来了 🎨\")"
+            ),
+        }
+    )
+
+    history = session.get_history(max_messages=500, include_timestamps=True)
+
+    assert history == [{"role": "assistant", "content": "来了 🎨"}]
+
+
 def test_get_history_respects_max_tokens(monkeypatch):
    session = Session(key="test:token-cap")
    session.messages.extend(
--- a/tests/tools/test_message_tool_suppress.py
+++ b/tests/tools/test_message_tool_suppress.py
@ -166,3 +166,13 @@ class TestMessageToolTurnTracking:
        tool._sent_in_turn = True
        tool.start_turn()
        assert not tool._sent_in_turn
+
+    def test_schema_discourages_current_chat_replies(self) -> None:
+        tool = MessageTool()
+
+        assert "Do not use this for the normal reply in the current chat" in tool.description
+        assert "generate_image creates images in the current chat" in tool.description
+        assert (
+            "Do not use this for a normal reply in the current chat"
+            in tool.parameters["properties"]["content"]["description"]
+        )
--- a/tests/utils/test_artifacts.py
+++ b/tests/utils/test_artifacts.py
@ -75,6 +75,7 @@ def test_generated_image_paths_from_tool_results() -> None:
            {"id": "img_2", "path": "/tmp/two.png"},
        ]
    )
+    payload = json.loads(result)

    assert generated_image_paths_from_messages(
        [
@ -82,3 +83,5 @@ def test_generated_image_paths_from_tool_results() -> None:
            {"role": "tool", "name": "other", "content": result},
        ]
    ) == ["/tmp/one.png", "/tmp/two.png"]
+    assert "runtime attaches generated images automatically" in payload["next_step"]
+    assert "Do not call message" in payload["next_step"]