fix(image-generation): let LLM deliver images via message tool instead of runtime media attachment

The runtime media-attachment mechanism was broken for streaming channels (e.g. WebSocket): the _streamed flag caused _send_once to skip the final OutboundMessage that carried generated media, so images were never delivered. Rather than adding complex coordination between streaming and media delivery, delegate image delivery to the LLM: after generate_image returns artifact paths, the next_step prompt now instructs the LLM to call the message tool with the paths in the media parameter. This works uniformly across all channels, streaming or not. Remove generated_media from TurnContext, _assemble_outbound, and _state_save. This also drops the merge_turn_media_into_last_assistant call (and its import), so the runtime no longer merges generated media paths into the last assistant message of the turn. Update prompts in identity.md, SKILL.md, the message tool description, and artifacts.py to reflect the new flow, and adjust the affected tests accordingly.
2026-05-19 16:12:30 +00:00 · 2026-05-19 00:42:56 +08:00 · 2026-05-19 00:42:56 +08:00 · 86858cfcb8
commit 86858cfcb8
parent b518cd5850
8 changed files with 16 additions and 28 deletions
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@ -36,14 +36,12 @@ from nanobot.session.goal_state import (
    runner_wall_llm_timeout_s,
 )
 from nanobot.session.manager import Session, SessionManager
-from nanobot.utils.artifacts import generated_image_paths_from_messages
 from nanobot.utils.document import extract_documents
 from nanobot.utils.helpers import image_placeholder_text
 from nanobot.utils.helpers import truncate_text as truncate_text_fn
 from nanobot.utils.image_generation_intent import image_generation_prompt
 from nanobot.utils.llm_runtime import LLMRuntime
 from nanobot.utils.runtime import EMPTY_FINAL_RESPONSE_MESSAGE
-from nanobot.utils.session_attachments import merge_turn_media_into_last_assistant
 from nanobot.utils.webui_turn_helpers import (
    WebuiTurnCoordinator,
    build_bus_progress_callback,
@ -103,7 +101,6 @@ class TurnContext:
    save_skip: int = 0

    outbound: OutboundMessage | None = None
-    generated_media: list[str] = field(default_factory=list)

    on_progress: Callable[..., Awaitable[None]] | None = None
    on_stream: Callable[[str], Awaitable[None]] | None = None
@ -1194,7 +1191,6 @@ class AgentLoop:
        all_msgs: list[dict[str, Any]],
        stop_reason: str,
        had_injections: bool,
-        generated_media: list[str],
        on_stream: Callable[[str], Awaitable[None]] | None,
        *,
        turn_latency_ms: int | None = None,
@ -1218,7 +1214,6 @@ class AgentLoop:
            channel=msg.channel,
            chat_id=msg.chat_id,
            content=final_content,
-            media=generated_media,
            metadata=meta,
        )

@ -1348,11 +1343,6 @@ class AgentLoop:
            ctx.final_content = EMPTY_FINAL_RESPONSE_MESSAGE

        ctx.save_skip = 1 + len(ctx.history) + (1 if ctx.user_persisted_early else 0)
-        skip_msgs = ctx.all_messages[ctx.save_skip:]
-        ctx.generated_media = generated_image_paths_from_messages(skip_msgs)
-        mt = self.tools.get("message")
-        extra = getattr(mt, "turn_delivered_media_paths", lambda: [])() if mt else []
-        merge_turn_media_into_last_assistant(ctx.all_messages, ctx.generated_media, extra)

        ctx.turn_latency_ms = max(0, int((time.time() - ctx.turn_wall_started_at) * 1000))
        self._save_turn(
@ -1380,7 +1370,6 @@ class AgentLoop:
            ctx.all_messages,
            ctx.stop_reason,
            ctx.had_injections,
-            ctx.generated_media,
            ctx.on_stream,
            turn_latency_ms=ctx.turn_latency_ms,
        )
--- a/nanobot/agent/tools/message.py
+++ b/nanobot/agent/tools/message.py
@ -140,8 +140,8 @@ class MessageTool(Tool, ContextAware):
            "Do not use this for the normal reply in the current chat: answer naturally instead. "
            "If channel/chat_id would target the current runtime conversation, do not call this tool "
            "unless the user explicitly asked you to proactively send an existing file attachment. "
-            "When generate_image creates images in the current chat, the final assistant reply "
-            "automatically attaches them; do not call message just to announce or resend them. "
+            "When generate_image creates images in the current chat, use the message tool "
+            "with the artifact paths in the media parameter to deliver the images to the user. "
            "For proactive attachment delivery, use the 'media' parameter with file paths. "
            "Do NOT use read_file to send files — that only reads content for your own analysis."
        )
--- a/nanobot/skills/image-generation/SKILL.md
+++ b/nanobot/skills/image-generation/SKILL.md
@ -15,7 +15,7 @@ If the `generate_image` tool is not available in the current tool list, tell the
 - Image editing: pass the saved artifact path or user image path in `reference_images`.
 - Iterative edits in the same conversation: prefer the most recent generated image artifact if the user says things like "make it brighter", "change the background", or "try another version".
 - Ambiguous edits: ask a short clarifying question if multiple recent images could be the target.
-- In the current chat, do not call `message` just to announce or resend generated images. The runtime attaches images from `generate_image` to the final assistant reply automatically.
+- After generating images, call the `message` tool with the artifact paths in the `media` parameter to deliver them to the user.

 ## Prompt Rules

--- a/nanobot/templates/agent/identity.md
+++ b/nanobot/templates/agent/identity.md
@ -30,5 +30,5 @@ Output is rendered in a terminal. Avoid markdown headings and tables. Use plain

 Reply directly with text for the current conversation. Do not use the 'message' tool for normal replies in the current chat.
 When you need to call tools before answering, do not include the final user-visible answer in the same assistant message as the tool calls. Wait for the tool results, then answer once.
-Use the 'message' tool only for proactive sends, cross-channel delivery, or explicitly sending existing local files as attachments. When a tool such as 'generate_image' creates user-visible media, the runtime attaches those artifacts to the final assistant reply automatically, so do not call 'message' just to announce or resend them.
+Use the 'message' tool only for proactive sends, cross-channel delivery, explicitly sending existing local files as attachments, or delivering generated images. When 'generate_image' creates images, call 'message' with the artifact paths in the 'media' parameter to deliver them to the user.
 To send an existing local file that was not automatically attached by another tool, call 'message' with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Example: message(content="Here is the document", channel="telegram", chat_id="...", media=["/path/to/file.pdf"])
--- a/nanobot/utils/artifacts.py
+++ b/nanobot/utils/artifacts.py
@ -115,8 +115,9 @@ def generated_image_tool_result(artifacts: list[dict[str, Any]]) -> str:
            "artifacts": artifacts,
            "next_step": (
                "Use these artifact paths as reference_images for follow-up edits. "
-                "For the current chat, reply naturally; the runtime attaches generated images automatically. "
-                "Do not call message just to announce or resend them. Keep raw paths internal unless the user asks for debug details."
+                "Call the message tool with the artifact paths in the media parameter "
+                "to deliver the images to the user. Keep raw paths internal unless the "
+                "user asks for debug details."
            ),
        },
        ensure_ascii=False,
--- a/tests/agent/test_context_prompt_cache.py
+++ b/tests/agent/test_context_prompt_cache.py
@ -314,8 +314,8 @@ def test_system_prompt_keeps_message_tool_out_of_current_chat_replies(tmp_path)
    prompt = builder.build_system_prompt(channel="slack")

    assert "Do not use the 'message' tool for normal replies in the current chat" in prompt
-    assert "the runtime attaches those artifacts to the final assistant reply automatically" in prompt
-    assert "do not call 'message' just to announce or resend them" in prompt
+    assert "When 'generate_image' creates images" in prompt
+    assert "call 'message' with the artifact paths in the 'media' parameter" in prompt
    assert "Wait for the tool results, then answer once" in prompt


--- a/tests/agent/test_loop_image_generation_media.py
+++ b/tests/agent/test_loop_image_generation_media.py
@ -29,10 +29,11 @@ class FakeImageClient:


@pytest.mark.asyncio
-async def test_generated_image_media_is_attached_to_final_assistant_message(
+async def test_outbound_no_longer_carries_generated_media(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
+    """Media delivery is now the LLM's responsibility via the message tool."""
    set_config_path(tmp_path / "config.json")
    monkeypatch.setattr(
        "nanobot.agent.tools.image_generation.get_image_gen_provider",
@ -81,9 +82,6 @@ async def test_generated_image_media_is_attached_to_final_assistant_message(

    assert result is not None
    assert result.content == "Done"
-    assert len(result.media) == 1
-    assert Path(result.media[0]).is_file()
-
-    session = loop.sessions.get_or_create("websocket:chat-image")
-    assert session.messages[-1]["role"] == "assistant"
-    assert session.messages[-1]["media"] == result.media
+    # OutboundMessage no longer carries generated media —
+    # the LLM sends images via the message tool instead.
+    assert result.media == []
--- a/tests/utils/test_artifacts.py
+++ b/tests/utils/test_artifacts.py
@ -83,5 +83,5 @@ def test_generated_image_paths_from_tool_results() -> None:
            {"role": "tool", "name": "other", "content": result},
        ]
    ) == ["/tmp/one.png", "/tmp/two.png"]
-    assert "runtime attaches generated images automatically" in payload["next_step"]
-    assert "Do not call message" in payload["next_step"]
+    assert "Call the message tool" in payload["next_step"]
+    assert "media parameter" in payload["next_step"]