fix(image): prevent duplicate delivery and replay artifacts

2026-05-20 00:22:31 +00:00 · 2026-05-09 05:45:13 +00:00 · 2026-05-09 05:45:13 +00:00 · 3231aaf9ee
commit 3231aaf9ee
parent 4d168c571c
9 changed files with 128 additions and 32 deletions
--- a/nanobot/agent/tools/message.py
+++ b/nanobot/agent/tools/message.py
@ -13,12 +13,24 @@ from nanobot.config.paths import get_workspace_path
@tool_parameters(
    tool_parameters_schema(
-        content=StringSchema("The message content to send"),
+        content=StringSchema(
-        channel=StringSchema("Optional: target channel (telegram, discord, etc.)"),
+            "Message content for proactive or cross-channel delivery. "
-        chat_id=StringSchema("Optional: target chat/user ID"),
+            "Do not use this for a normal reply in the current chat."
        ),
        channel=StringSchema(
            "Optional target channel for cross-channel/proactive delivery. "
            "Do not set this to the current runtime channel for a normal reply."
        ),
        chat_id=StringSchema(
            "Optional target chat/user ID for cross-channel/proactive delivery. "
            "Do not set this to the current runtime chat for a normal reply."
        ),
        media=ArraySchema(
            StringSchema(""),
-            description="Optional: list of file paths to attach (images, video, audio, documents)",
+            description=(
                "Optional list of existing file paths to attach for proactive or cross-channel delivery. "
                "Do not use this to resend generate_image outputs in the current chat."
            ),
        ),
        buttons=ArraySchema(
            ArraySchema(StringSchema("Button label")),
@ -100,9 +112,14 @@ class MessageTool(Tool):
    @property
    def description(self) -> str:
        return (
-            "Send a message to the user, optionally with file attachments. "
+            "Proactively send a message to a user/channel, optionally with file attachments. "
-            "This is the ONLY way to deliver files (images, documents, audio, video) to the user. "
+            "Use this for reminders, cross-channel delivery, or explicit proactive sends. "
-            "Use the 'media' parameter with file paths to attach files. "
+            "Do not use this for the normal reply in the current chat: answer naturally instead. "
            "If channel/chat_id would target the current runtime conversation, do not call this tool "
            "unless the user explicitly asked you to proactively send an existing file attachment. "
            "When generate_image creates images in the current chat, the final assistant reply "
            "automatically attaches them; do not call message just to announce or resend them. "
            "For proactive attachment delivery, use the 'media' parameter with file paths. "
            "Do NOT use read_file to send files — that only reads content for your own analysis."
        )
--- a/nanobot/session/manager.py
+++ b/nanobot/session/manager.py
@ -2,6 +2,7 @@
 import json
 import os
 import re
 import shutil
 from contextlib import suppress
 from dataclasses import dataclass, field
@ -21,6 +22,25 @@ from nanobot.utils.helpers import (
 )
 FILE_MAX_MESSAGES = 2000
 _MESSAGE_TIME_PREFIX_RE = re.compile(r"^\[Message Time: [^\]]+\]\n?")
 _LOCAL_IMAGE_BREADCRUMB_RE = re.compile(r"^\[image: (?:/|~)[^\]]+\]\s*$")
 _TOOL_CALL_ECHO_RE = re.compile(r'^\s*(?:generate_image|message)\([^)]*\)\s*$')
 def _sanitize_assistant_replay_text(content: str) -> str:
    """Remove internal replay artifacts that the model may have copied before.
    These strings are useful as runtime/session metadata, but when they appear
    in assistant examples they become demonstrations for the model to repeat.
    """
    content = _MESSAGE_TIME_PREFIX_RE.sub("", content, count=1)
    lines = [
        line
        for line in content.splitlines()
        if not _LOCAL_IMAGE_BREADCRUMB_RE.match(line)
        and not _TOOL_CALL_ECHO_RE.match(line)
    ]
    return "\n".join(lines).strip()
@dataclass
@ -41,22 +61,15 @@ class Session:
        Annotating *every* assistant turn trains the model (via in-context
        demonstrations) to start its own replies with the same
        ``[Message Time: ...]`` prefix, which leaks metadata back to the user.
-        We therefore only annotate:
+        We therefore only annotate user turns. User-side stamps are enough to
-
+        pin adjacent assistant replies for relative-time reasoning, including
-        * ``user`` turns — needed so the model can pin the conversation in time.
+        proactive messages the user replies to later.
        * proactive deliveries (``_channel_delivery=True``) — cron / heartbeat
          assistant pushes that may sit hours away from the next user reply,
          and are too infrequent to act as parroting demonstrations.
        """
        timestamp = message.get("timestamp")
        if not timestamp or not isinstance(content, str):
            return content
        role = message.get("role")
-        if role == "user":
+        if role != "user":
            pass
        elif role == "assistant" and message.get("_channel_delivery"):
            pass
        else:
            return content
        return f"[Message Time: {timestamp}]\n{content}"
@ -105,19 +118,25 @@ class Session:
        out: list[dict[str, Any]] = []
        for message in sliced:
            content = message.get("content", "")
            role = message.get("role")
            if role == "assistant" and isinstance(content, str):
                content = _sanitize_assistant_replay_text(content)
            # Synthesize an ``[image: path]`` breadcrumb from the persisted
            # ``media`` kwarg so LLM replay still sees *something* where the
            # image used to be. Without this, an image-only user turn
            # replays as an empty user message — the assistant's reply then
            # looks like it's responding to nothing.
            media = message.get("media")
-            if isinstance(media, list) and media and isinstance(content, str):
+            if role == "user" and isinstance(media, list) and media and isinstance(content, str):
                breadcrumbs = "\n".join(
                    image_placeholder_text(p) for p in media if isinstance(p, str) and p
                )
                content = f"{content}\n{breadcrumbs}" if content else breadcrumbs
            if include_timestamps:
                content = self._annotate_message_time(message, content)
            if role == "assistant" and isinstance(content, str) and not content.strip():
                if not any(key in message for key in ("tool_calls", "reasoning_content", "thinking_blocks")):
                    continue
            entry: dict[str, Any] = {"role": message["role"], "content": content}
            for key in ("tool_calls", "tool_call_id", "name", "reasoning_content", "thinking_blocks"):
                if key in message:
--- a/nanobot/skills/image-generation/SKILL.md
+++ b/nanobot/skills/image-generation/SKILL.md
@ -15,6 +15,7 @@ If the `generate_image` tool is not available in the current tool list, tell the
 - Image editing: pass the saved artifact path or user image path in `reference_images`.
 - Iterative edits in the same conversation: prefer the most recent generated image artifact if the user says things like "make it brighter", "change the background", or "try another version".
 - Ambiguous edits: ask a short clarifying question if multiple recent images could be the target.
 - In the current chat, do not call `message` just to announce or resend generated images. The runtime attaches images from `generate_image` to the final assistant reply automatically.
 ## Prompt Rules
@ -39,9 +40,11 @@ In normal user-facing replies, do not expose local filesystem paths. Keep the re
 For follow-up edits, pass the prior artifact `path` to `reference_images`. If the user provides a new uploaded image, use that path as the reference instead.
 Do not include internal replay markers such as `[Message Time: ...]`, `[image: /local/path]`, `generate_image(...)`, or `message(...)` in user-facing replies.
 ## Provider Notes
-Do not ask users to paste API keys into chat. If configuration is needed, describe the fields and remind them to restart the gateway after changing config.
+Do not ask users to paste API keys into chat. If configuration is needed, describe the fields; LLM provider and BYOK changes are hot-reloaded for new turns.
 For OpenRouter, the image tool expects:
--- a/nanobot/templates/agent/identity.md
+++ b/nanobot/templates/agent/identity.md
@ -28,5 +28,7 @@ Output is rendered in a terminal. Avoid markdown headings and tables. Use plain
 - On broad searches, use `grep(output_mode="count")` to scope before requesting full content.
 {% include 'agent/_snippets/untrusted_content.md' %}
-Reply directly with text for conversations. Only use the 'message' tool to send to a specific chat channel.
+Reply directly with text for the current conversation. Do not use the 'message' tool for normal replies in the current chat.
-IMPORTANT: To send files (images, video, audio, documents) to the user, you MUST call the 'message' tool with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Examples: message(content="Here is the image", media=["/path/to/file.png"]) or message(content="Here is the video", media=["/path/to/video.mp4"])
+When you need to call tools before answering, do not include the final user-visible answer in the same assistant message as the tool calls. Wait for the tool results, then answer once.
 Use the 'message' tool only for proactive sends, cross-channel delivery, or explicitly sending existing local files as attachments. When a tool such as 'generate_image' creates user-visible media, the runtime attaches those artifacts to the final assistant reply automatically, so do not call 'message' just to announce or resend them.
 To send an existing local file that was not automatically attached by another tool, call 'message' with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Example: message(content="Here is the document", channel="telegram", chat_id="...", media=["/path/to/file.pdf"])
--- a/nanobot/utils/artifacts.py
+++ b/nanobot/utils/artifacts.py
@ -115,7 +115,8 @@ def generated_image_tool_result(artifacts: list[dict[str, Any]]) -> str:
            "artifacts": artifacts,
            "next_step": (
                "Use these artifact paths as reference_images for follow-up edits. "
-                "Mention the image id/path to the user; do not paste base64."
+                "For the current chat, reply naturally; the runtime attaches generated images automatically. "
                "Do not call message just to announce or resend them. Keep raw paths internal unless the user asks for debug details."
            ),
        },
        ensure_ascii=False,
--- a/tests/agent/test_context_prompt_cache.py
+++ b/tests/agent/test_context_prompt_cache.py
@ -289,6 +289,18 @@ def test_build_messages_passes_channel_to_system_prompt(tmp_path) -> None:
    assert "messaging app" in system
 def test_system_prompt_keeps_message_tool_out_of_current_chat_replies(tmp_path) -> None:
    workspace = _make_workspace(tmp_path)
    builder = ContextBuilder(workspace)
    prompt = builder.build_system_prompt(channel="slack")
    assert "Do not use the 'message' tool for normal replies in the current chat" in prompt
    assert "the runtime attaches those artifacts to the final assistant reply automatically" in prompt
    assert "do not call 'message' just to announce or resend them" in prompt
    assert "Wait for the tool results, then answer once" in prompt
 def test_subagent_result_does_not_create_consecutive_assistant_messages(tmp_path) -> None:
    workspace = _make_workspace(tmp_path)
    builder = ContextBuilder(workspace)
--- a/tests/agent/test_session_manager_history.py
+++ b/tests/agent/test_session_manager_history.py
@ -245,14 +245,8 @@ def test_get_history_annotates_user_turns_but_not_assistant_turns():
    ]
-def test_get_history_annotates_proactive_assistant_deliveries_with_timestamps():
+def test_get_history_does_not_annotate_proactive_assistant_deliveries_with_timestamps():
-    """Cron / heartbeat assistant pushes still carry a timestamp prefix.
+    """Assistant-side timestamp examples can leak back into future replies."""
    These proactive deliveries can sit hours away from the next user reply,
    so the model needs to know when they fired. They are rare enough that
    they don't act as in-context demonstrations encouraging the model to
    prefix its own normal replies with ``[Message Time: ...]``.
    """
    session = Session(key="test:proactive-timestamps")
    session.messages.append({
        "role": "assistant",
@ -271,7 +265,7 @@ def test_get_history_annotates_proactive_assistant_deliveries_with_timestamps():
    assert history == [
        {
            "role": "assistant",
-            "content": "[Message Time: 2026-04-26T15:00:00]\n记得喝水",
+            "content": "记得喝水",
        },
        {
            "role": "user",
@ -370,6 +364,41 @@ def test_get_history_ignores_media_kwarg_on_non_user_rows():
    assert history[0]["content"] == [{"type": "text", "text": "structured"}]
 def test_get_history_does_not_paste_assistant_media_paths_into_replay():
    session = Session(key="test:assistant-media")
    session.messages.append(
        {
            "role": "assistant",
            "content": "来了 🎨",
            "media": ["/home/user/.nanobot/media/generated/img_abc.png"],
        }
    )
    history = session.get_history(max_messages=500)
    assert history == [{"role": "assistant", "content": "来了 🎨"}]
 def test_get_history_sanitizes_existing_assistant_replay_artifacts():
    session = Session(key="test:polluted-assistant")
    session.messages.append(
        {
            "role": "assistant",
            "content": (
                "[Message Time: 2026-05-09 00:33:48]\n"
                "来了 🎨\n"
                "[image: /home/user/.nanobot/media/generated/img_old.png]\n\n"
                "generate_image(\"16:9\")\n"
                "message(\"来了 🎨\")"
            ),
        }
    )
    history = session.get_history(max_messages=500, include_timestamps=True)
    assert history == [{"role": "assistant", "content": "来了 🎨"}]
 def test_get_history_respects_max_tokens(monkeypatch):
    session = Session(key="test:token-cap")
    session.messages.extend(
--- a/tests/tools/test_message_tool_suppress.py
+++ b/tests/tools/test_message_tool_suppress.py
@ -166,3 +166,13 @@ class TestMessageToolTurnTracking:
        tool._sent_in_turn = True
        tool.start_turn()
        assert not tool._sent_in_turn
    def test_schema_discourages_current_chat_replies(self) -> None:
        tool = MessageTool()
        assert "Do not use this for the normal reply in the current chat" in tool.description
        assert "generate_image creates images in the current chat" in tool.description
        assert (
            "Do not use this for a normal reply in the current chat"
            in tool.parameters["properties"]["content"]["description"]
        )
--- a/tests/utils/test_artifacts.py
+++ b/tests/utils/test_artifacts.py
@ -75,6 +75,7 @@ def test_generated_image_paths_from_tool_results() -> None:
            {"id": "img_2", "path": "/tmp/two.png"},
        ]
    )
    payload = json.loads(result)
    assert generated_image_paths_from_messages(
        [
@ -82,3 +83,5 @@ def test_generated_image_paths_from_tool_results() -> None:
            {"role": "tool", "name": "other", "content": result},
        ]
    ) == ["/tmp/one.png", "/tmp/two.png"]
    assert "runtime attaches generated images automatically" in payload["next_step"]
    assert "Do not call message" in payload["next_step"]