From fc1c8ea77075b4cb8042c554d81db9b9411ec838 Mon Sep 17 00:00:00 2001
From: chengyongru <2755839590@qq.com>
Date: Tue, 19 May 2026 00:42:56 +0800
Subject: [PATCH] fix(image-generation): let LLM deliver images via message
 tool instead of runtime media attachment

The runtime media-attachment mechanism was broken for streaming channels
(e.g. WebSocket): the _streamed flag caused _send_once to skip the final
OutboundMessage that carried generated media, so images were never delivered.

Rather than adding complex coordination between streaming and media delivery,
delegate image delivery to the LLM: after generate_image returns artifact
paths, the next_step prompt now instructs the LLM to call the message tool
with the paths in the media parameter. This works uniformly across all
channels, streaming or not.

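Remove generated_media from TurnContext, _assemble_outbound, and _state_save,
and drop the merge_turn_media_into_last_assistant call that ran just before
the turn was saved, so media paths are no longer merged into the last
assistant message there.
Update prompts in identity.md, SKILL.md, message tool description, and
artifacts.py to reflect the new flow, and adjust the affected tests to match.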
---
 nanobot/agent/loop.py                           | 11 -----------
 nanobot/agent/tools/message.py                  |  4 ++--
 nanobot/skills/image-generation/SKILL.md        |  2 +-
 nanobot/templates/agent/identity.md             |  2 +-
 nanobot/utils/artifacts.py                      |  5 +++--
 tests/agent/test_context_prompt_cache.py        |  4 ++--
 tests/agent/test_loop_image_generation_media.py | 12 +++++-------
 tests/utils/test_artifacts.py                   |  4 ++--
 8 files changed, 16 insertions(+), 28 deletions(-)

diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py
index c1f521170..6f3926120 100644
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -36,14 +36,12 @@ from nanobot.session.goal_state import (
     runner_wall_llm_timeout_s,
 )
 from nanobot.session.manager import Session, SessionManager
-from nanobot.utils.artifacts import generated_image_paths_from_messages
 from nanobot.utils.document import extract_documents
 from nanobot.utils.helpers import image_placeholder_text
 from nanobot.utils.helpers import truncate_text as truncate_text_fn
 from nanobot.utils.image_generation_intent import image_generation_prompt
 from nanobot.utils.llm_runtime import LLMRuntime
 from nanobot.utils.runtime import EMPTY_FINAL_RESPONSE_MESSAGE
-from nanobot.utils.session_attachments import merge_turn_media_into_last_assistant
 from nanobot.utils.webui_turn_helpers import (
     WebuiTurnCoordinator,
     build_bus_progress_callback,
@@ -103,7 +101,6 @@ class TurnContext:
     save_skip: int = 0
 
     outbound: OutboundMessage | None = None
-    generated_media: list[str] = field(default_factory=list)
 
     on_progress: Callable[..., Awaitable[None]] | None = None
     on_stream: Callable[[str], Awaitable[None]] | None = None
@@ -1194,7 +1191,6 @@ class AgentLoop:
         all_msgs: list[dict[str, Any]],
         stop_reason: str,
         had_injections: bool,
-        generated_media: list[str],
         on_stream: Callable[[str], Awaitable[None]] | None,
         *,
         turn_latency_ms: int | None = None,
@@ -1218,7 +1214,6 @@ class AgentLoop:
             channel=msg.channel,
             chat_id=msg.chat_id,
             content=final_content,
-            media=generated_media,
             metadata=meta,
         )
 
@@ -1348,11 +1343,6 @@ class AgentLoop:
             ctx.final_content = EMPTY_FINAL_RESPONSE_MESSAGE
 
         ctx.save_skip = 1 + len(ctx.history) + (1 if ctx.user_persisted_early else 0)
-        skip_msgs = ctx.all_messages[ctx.save_skip:]
-        ctx.generated_media = generated_image_paths_from_messages(skip_msgs)
-        mt = self.tools.get("message")
-        extra = getattr(mt, "turn_delivered_media_paths", lambda: [])() if mt else []
-        merge_turn_media_into_last_assistant(ctx.all_messages, ctx.generated_media, extra)
 
         ctx.turn_latency_ms = max(0, int((time.time() - ctx.turn_wall_started_at) * 1000))
         self._save_turn(
@@ -1380,7 +1370,6 @@ class AgentLoop:
             ctx.all_messages,
             ctx.stop_reason,
             ctx.had_injections,
-            ctx.generated_media,
             ctx.on_stream,
             turn_latency_ms=ctx.turn_latency_ms,
         )
diff --git a/nanobot/agent/tools/message.py b/nanobot/agent/tools/message.py
index 725e824e5..4e2b5554d 100644
--- a/nanobot/agent/tools/message.py
+++ b/nanobot/agent/tools/message.py
@@ -140,8 +140,8 @@ class MessageTool(Tool, ContextAware):
             "Do not use this for the normal reply in the current chat: answer naturally instead. "
             "If channel/chat_id would target the current runtime conversation, do not call this tool "
             "unless the user explicitly asked you to proactively send an existing file attachment. "
-            "When generate_image creates images in the current chat, the final assistant reply "
-            "automatically attaches them; do not call message just to announce or resend them. "
+            "When generate_image creates images in the current chat, use the message tool "
+            "with the artifact paths in the media parameter to deliver the images to the user. "
             "For proactive attachment delivery, use the 'media' parameter with file paths. "
             "Do NOT use read_file to send files — that only reads content for your own analysis."
         )
diff --git a/nanobot/skills/image-generation/SKILL.md b/nanobot/skills/image-generation/SKILL.md
index 0559651f6..d50fb0648 100644
--- a/nanobot/skills/image-generation/SKILL.md
+++ b/nanobot/skills/image-generation/SKILL.md
@@ -15,7 +15,7 @@ If the `generate_image` tool is not available in the current tool list, tell the
 - Image editing: pass the saved artifact path or user image path in `reference_images`.
 - Iterative edits in the same conversation: prefer the most recent generated image artifact if the user says things like "make it brighter", "change the background", or "try another version".
 - Ambiguous edits: ask a short clarifying question if multiple recent images could be the target.
-- In the current chat, do not call `message` just to announce or resend generated images. The runtime attaches images from `generate_image` to the final assistant reply automatically.
+- After generating images, call the `message` tool with the artifact paths in the `media` parameter to deliver them to the user.
 
 ## Prompt Rules
 
diff --git a/nanobot/templates/agent/identity.md b/nanobot/templates/agent/identity.md
index 6548c1def..e6fa55354 100644
--- a/nanobot/templates/agent/identity.md
+++ b/nanobot/templates/agent/identity.md
@@ -30,5 +30,5 @@ Output is rendered in a terminal. Avoid markdown headings and tables. Use plain
 
 Reply directly with text for the current conversation. Do not use the 'message' tool for normal replies in the current chat.
 When you need to call tools before answering, do not include the final user-visible answer in the same assistant message as the tool calls. Wait for the tool results, then answer once.
-Use the 'message' tool only for proactive sends, cross-channel delivery, or explicitly sending existing local files as attachments. When a tool such as 'generate_image' creates user-visible media, the runtime attaches those artifacts to the final assistant reply automatically, so do not call 'message' just to announce or resend them.
+Use the 'message' tool only for proactive sends, cross-channel delivery, or explicitly sending existing local files as attachments. When 'generate_image' creates images, call 'message' with the artifact paths in the 'media' parameter to deliver them to the user.
 To send an existing local file that was not automatically attached by another tool, call 'message' with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Example: message(content="Here is the document", channel="telegram", chat_id="...", media=["/path/to/file.pdf"])
diff --git a/nanobot/utils/artifacts.py b/nanobot/utils/artifacts.py
index eca706eed..f01e08942 100644
--- a/nanobot/utils/artifacts.py
+++ b/nanobot/utils/artifacts.py
@@ -115,8 +115,9 @@ def generated_image_tool_result(artifacts: list[dict[str, Any]]) -> str:
             "artifacts": artifacts,
             "next_step": (
                 "Use these artifact paths as reference_images for follow-up edits. "
-                "For the current chat, reply naturally; the runtime attaches generated images automatically. "
-                "Do not call message just to announce or resend them. Keep raw paths internal unless the user asks for debug details."
+                "Call the message tool with the artifact paths in the media parameter "
+                "to deliver the images to the user. Keep raw paths internal unless the "
+                "user asks for debug details."
             ),
         },
         ensure_ascii=False,
diff --git a/tests/agent/test_context_prompt_cache.py b/tests/agent/test_context_prompt_cache.py
index 4b6f3dadf..bbafd4890 100644
--- a/tests/agent/test_context_prompt_cache.py
+++ b/tests/agent/test_context_prompt_cache.py
@@ -314,8 +314,8 @@ def test_system_prompt_keeps_message_tool_out_of_current_chat_replies(tmp_path)
     prompt = builder.build_system_prompt(channel="slack")
 
     assert "Do not use the 'message' tool for normal replies in the current chat" in prompt
-    assert "the runtime attaches those artifacts to the final assistant reply automatically" in prompt
-    assert "do not call 'message' just to announce or resend them" in prompt
+    assert "When 'generate_image' creates images" in prompt
+    assert "call 'message' with the artifact paths in the 'media' parameter" in prompt
     assert "Wait for the tool results, then answer once" in prompt
 
 
diff --git a/tests/agent/test_loop_image_generation_media.py b/tests/agent/test_loop_image_generation_media.py
index 73904be93..cfcc3b2cd 100644
--- a/tests/agent/test_loop_image_generation_media.py
+++ b/tests/agent/test_loop_image_generation_media.py
@@ -29,10 +29,11 @@ class FakeImageClient:
 
 
 @pytest.mark.asyncio
-async def test_generated_image_media_is_attached_to_final_assistant_message(
+async def test_outbound_no_longer_carries_generated_media(
     tmp_path: Path,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
+    """Media delivery is now the LLM's responsibility via the message tool."""
     set_config_path(tmp_path / "config.json")
     monkeypatch.setattr(
         "nanobot.agent.tools.image_generation.get_image_gen_provider",
@@ -81,9 +82,6 @@ async def test_generated_image_media_is_attached_to_final_assistant_message(
 
     assert result is not None
     assert result.content == "Done"
-    assert len(result.media) == 1
-    assert Path(result.media[0]).is_file()
-
-    session = loop.sessions.get_or_create("websocket:chat-image")
-    assert session.messages[-1]["role"] == "assistant"
-    assert session.messages[-1]["media"] == result.media
+    # OutboundMessage no longer carries generated media —
+    # the LLM sends images via the message tool instead.
+    assert result.media == []
diff --git a/tests/utils/test_artifacts.py b/tests/utils/test_artifacts.py
index 64d2e3f32..54c9b222a 100644
--- a/tests/utils/test_artifacts.py
+++ b/tests/utils/test_artifacts.py
@@ -83,5 +83,5 @@ def test_generated_image_paths_from_tool_results() -> None:
             {"role": "tool", "name": "other", "content": result},
         ]
     ) == ["/tmp/one.png", "/tmp/two.png"]
-    assert "runtime attaches generated images automatically" in payload["next_step"]
-    assert "Do not call message" in payload["next_step"]
+    assert "Call the message tool" in payload["next_step"]
+    assert "media parameter" in payload["next_step"]