fix(image-generation): let LLM deliver images via message tool instead of runtime media attachment

The runtime media-attachment mechanism was broken for streaming channels
(e.g. WebSocket): the _streamed flag caused _send_once to skip the final
OutboundMessage that carried generated media, so images were never delivered.

Rather than adding complex coordination between streaming and media delivery,
delegate image delivery to the LLM: after generate_image returns artifact
paths, the next_step prompt now instructs the LLM to call the message tool
with the paths in the media parameter. This works uniformly across all
channels, streaming or not.

Remove generated_media from TurnContext, _assemble_outbound, and _state_save.
Update prompts in identity.md, SKILL.md, message tool description, and
artifacts.py to reflect the new flow.
This commit is contained in:
chengyongru 2026-05-19 00:42:56 +08:00 committed by Xubin Ren
parent 99e4d25d4c
commit fc1c8ea770
8 changed files with 16 additions and 28 deletions

View File

@ -36,14 +36,12 @@ from nanobot.session.goal_state import (
runner_wall_llm_timeout_s,
)
from nanobot.session.manager import Session, SessionManager
from nanobot.utils.artifacts import generated_image_paths_from_messages
from nanobot.utils.document import extract_documents
from nanobot.utils.helpers import image_placeholder_text
from nanobot.utils.helpers import truncate_text as truncate_text_fn
from nanobot.utils.image_generation_intent import image_generation_prompt
from nanobot.utils.llm_runtime import LLMRuntime
from nanobot.utils.runtime import EMPTY_FINAL_RESPONSE_MESSAGE
from nanobot.utils.session_attachments import merge_turn_media_into_last_assistant
from nanobot.utils.webui_turn_helpers import (
WebuiTurnCoordinator,
build_bus_progress_callback,
@ -103,7 +101,6 @@ class TurnContext:
save_skip: int = 0
outbound: OutboundMessage | None = None
generated_media: list[str] = field(default_factory=list)
on_progress: Callable[..., Awaitable[None]] | None = None
on_stream: Callable[[str], Awaitable[None]] | None = None
@ -1194,7 +1191,6 @@ class AgentLoop:
all_msgs: list[dict[str, Any]],
stop_reason: str,
had_injections: bool,
generated_media: list[str],
on_stream: Callable[[str], Awaitable[None]] | None,
*,
turn_latency_ms: int | None = None,
@ -1218,7 +1214,6 @@ class AgentLoop:
channel=msg.channel,
chat_id=msg.chat_id,
content=final_content,
media=generated_media,
metadata=meta,
)
@ -1348,11 +1343,6 @@ class AgentLoop:
ctx.final_content = EMPTY_FINAL_RESPONSE_MESSAGE
ctx.save_skip = 1 + len(ctx.history) + (1 if ctx.user_persisted_early else 0)
skip_msgs = ctx.all_messages[ctx.save_skip:]
ctx.generated_media = generated_image_paths_from_messages(skip_msgs)
mt = self.tools.get("message")
extra = getattr(mt, "turn_delivered_media_paths", lambda: [])() if mt else []
merge_turn_media_into_last_assistant(ctx.all_messages, ctx.generated_media, extra)
ctx.turn_latency_ms = max(0, int((time.time() - ctx.turn_wall_started_at) * 1000))
self._save_turn(
@ -1380,7 +1370,6 @@ class AgentLoop:
ctx.all_messages,
ctx.stop_reason,
ctx.had_injections,
ctx.generated_media,
ctx.on_stream,
turn_latency_ms=ctx.turn_latency_ms,
)

View File

@ -140,8 +140,8 @@ class MessageTool(Tool, ContextAware):
"Do not use this for the normal reply in the current chat: answer naturally instead. "
"If channel/chat_id would target the current runtime conversation, do not call this tool "
"unless the user explicitly asked you to proactively send an existing file attachment. "
"When generate_image creates images in the current chat, the final assistant reply "
"automatically attaches them; do not call message just to announce or resend them. "
"When generate_image creates images in the current chat, use the message tool "
"with the artifact paths in the media parameter to deliver the images to the user. "
"For proactive attachment delivery, use the 'media' parameter with file paths. "
"Do NOT use read_file to send files — that only reads content for your own analysis."
)

View File

@ -15,7 +15,7 @@ If the `generate_image` tool is not available in the current tool list, tell the
- Image editing: pass the saved artifact path or user image path in `reference_images`.
- Iterative edits in the same conversation: prefer the most recent generated image artifact if the user says things like "make it brighter", "change the background", or "try another version".
- Ambiguous edits: ask a short clarifying question if multiple recent images could be the target.
- In the current chat, do not call `message` just to announce or resend generated images. The runtime attaches images from `generate_image` to the final assistant reply automatically.
- After generating images, call the `message` tool with the artifact paths in the `media` parameter to deliver them to the user.
## Prompt Rules

View File

@ -30,5 +30,5 @@ Output is rendered in a terminal. Avoid markdown headings and tables. Use plain
Reply directly with text for the current conversation. Do not use the 'message' tool for normal replies in the current chat.
When you need to call tools before answering, do not include the final user-visible answer in the same assistant message as the tool calls. Wait for the tool results, then answer once.
Use the 'message' tool only for proactive sends, cross-channel delivery, or explicitly sending existing local files as attachments. When a tool such as 'generate_image' creates user-visible media, the runtime attaches those artifacts to the final assistant reply automatically, so do not call 'message' just to announce or resend them.
Use the 'message' tool only for proactive sends, cross-channel delivery, explicitly sending existing local files as attachments, or delivering generated images. When 'generate_image' creates images, call 'message' with the artifact paths in the 'media' parameter to deliver them to the user.
To send an existing local file, call 'message' with the 'media' parameter. Do NOT use read_file to "send" a file — reading a file only shows its content to you, it does NOT deliver the file to the user. Example: message(content="Here is the document", channel="telegram", chat_id="...", media=["/path/to/file.pdf"])

View File

@ -115,8 +115,9 @@ def generated_image_tool_result(artifacts: list[dict[str, Any]]) -> str:
"artifacts": artifacts,
"next_step": (
"Use these artifact paths as reference_images for follow-up edits. "
"For the current chat, reply naturally; the runtime attaches generated images automatically. "
"Do not call message just to announce or resend them. Keep raw paths internal unless the user asks for debug details."
"Call the message tool with the artifact paths in the media parameter "
"to deliver the images to the user. Keep raw paths internal unless the "
"user asks for debug details."
),
},
ensure_ascii=False,

View File

@ -314,8 +314,8 @@ def test_system_prompt_keeps_message_tool_out_of_current_chat_replies(tmp_path)
prompt = builder.build_system_prompt(channel="slack")
assert "Do not use the 'message' tool for normal replies in the current chat" in prompt
assert "the runtime attaches those artifacts to the final assistant reply automatically" in prompt
assert "do not call 'message' just to announce or resend them" in prompt
assert "When 'generate_image' creates images" in prompt
assert "call 'message' with the artifact paths in the 'media' parameter" in prompt
assert "Wait for the tool results, then answer once" in prompt

View File

@ -29,10 +29,11 @@ class FakeImageClient:
@pytest.mark.asyncio
async def test_generated_image_media_is_attached_to_final_assistant_message(
async def test_outbound_no_longer_carries_generated_media(
tmp_path: Path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Media delivery is now the LLM's responsibility via the message tool."""
set_config_path(tmp_path / "config.json")
monkeypatch.setattr(
"nanobot.agent.tools.image_generation.get_image_gen_provider",
@ -81,9 +82,6 @@ async def test_generated_image_media_is_attached_to_final_assistant_message(
assert result is not None
assert result.content == "Done"
assert len(result.media) == 1
assert Path(result.media[0]).is_file()
session = loop.sessions.get_or_create("websocket:chat-image")
assert session.messages[-1]["role"] == "assistant"
assert session.messages[-1]["media"] == result.media
# OutboundMessage no longer carries generated media —
# the LLM sends images via the message tool instead.
assert result.media == []

View File

@ -83,5 +83,5 @@ def test_generated_image_paths_from_tool_results() -> None:
{"role": "tool", "name": "other", "content": result},
]
) == ["/tmp/one.png", "/tmp/two.png"]
assert "runtime attaches generated images automatically" in payload["next_step"]
assert "Do not call message" in payload["next_step"]
assert "Call the message tool" in payload["next_step"]
assert "media parameter" in payload["next_step"]