From e936ed48bd09fe0ab7e3eadcb56d5533c0873800 Mon Sep 17 00:00:00 2001 From: Xubin Ren Date: Fri, 8 May 2026 09:40:15 +0000 Subject: [PATCH] feat: add image generation tool and WebUI mode Co-authored-by: Cursor --- docs/README.md | 1 + docs/configuration.md | 6 + docs/image-generation.md | 200 +++++++++ nanobot/agent/loop.py | 38 +- nanobot/agent/tools/image_generation.py | 192 +++++++++ nanobot/agent/tools/message.py | 2 +- nanobot/channels/websocket.py | 16 +- nanobot/cli/commands.py | 14 +- nanobot/config/schema.py | 13 + nanobot/nanobot.py | 4 + nanobot/providers/image_generation.py | 395 ++++++++++++++++++ nanobot/skills/image-generation/SKILL.md | 109 +++++ nanobot/utils/artifacts.py | 161 +++++++ nanobot/utils/image_generation_intent.py | 27 ++ .../agent/test_loop_image_generation_media.py | 89 ++++ tests/providers/test_image_generation.py | 204 +++++++++ tests/tools/test_image_generation_tool.py | 154 +++++++ tests/tools/test_message_tool.py | 19 + tests/utils/test_artifacts.py | 84 ++++ tests/utils/test_image_generation_intent.py | 25 ++ webui/src/App.tsx | 52 ++- webui/src/components/MessageBubble.tsx | 17 +- .../src/components/settings/SettingsView.tsx | 18 + .../src/components/thread/ThreadComposer.tsx | 225 +++++++++- webui/src/components/thread/ThreadShell.tsx | 73 +++- webui/src/hooks/useNanobotStream.ts | 94 +++-- webui/src/i18n/locales/en/common.json | 50 +++ webui/src/i18n/locales/es/common.json | 50 +++ webui/src/i18n/locales/fr/common.json | 50 +++ webui/src/i18n/locales/id/common.json | 50 +++ webui/src/i18n/locales/ja/common.json | 50 +++ webui/src/i18n/locales/ko/common.json | 50 +++ webui/src/i18n/locales/vi/common.json | 50 +++ webui/src/i18n/locales/zh-CN/common.json | 50 +++ webui/src/i18n/locales/zh-TW/common.json | 50 +++ webui/src/lib/api.ts | 16 +- webui/src/lib/nanobot-client.ts | 20 +- webui/src/lib/types.ts | 6 + webui/src/tests/api.test.ts | 12 + webui/src/tests/i18n.test.tsx | 6 + webui/src/tests/message-bubble.test.tsx | 22 + 
webui/src/tests/nanobot-client.test.ts | 27 ++ webui/src/tests/thread-composer.test.tsx | 112 +++++ webui/src/tests/thread-shell.test.tsx | 82 ++++ webui/src/tests/useNanobotStream.test.tsx | 83 ++++ 45 files changed, 2979 insertions(+), 89 deletions(-) create mode 100644 docs/image-generation.md create mode 100644 nanobot/agent/tools/image_generation.py create mode 100644 nanobot/providers/image_generation.py create mode 100644 nanobot/skills/image-generation/SKILL.md create mode 100644 nanobot/utils/artifacts.py create mode 100644 nanobot/utils/image_generation_intent.py create mode 100644 tests/agent/test_loop_image_generation_media.py create mode 100644 tests/providers/test_image_generation.py create mode 100644 tests/tools/test_image_generation_tool.py create mode 100644 tests/utils/test_artifacts.py create mode 100644 tests/utils/test_image_generation_intent.py diff --git a/docs/README.md b/docs/README.md index d8ff30247..56b8dab2f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -14,6 +14,7 @@ Start here for setup, everyday usage, and deployment. 
| Chat apps | [`chat-apps.md`](./chat-apps.md) | Connect nanobot to Telegram, Discord, WeChat, and more | | Agent social network | [`agent-social-network.md`](./agent-social-network.md) | Join external agent communities from nanobot | | Configuration | [`configuration.md`](./configuration.md) | Providers, tools, channels, MCP, and runtime settings | +| Image generation | [`image-generation.md`](./image-generation.md) | Configure image providers, WebUI image mode, and generated artifacts | | Multiple instances | [`multiple-instances.md`](./multiple-instances.md) | Run isolated bots with separate configs and workspaces | | CLI reference | [`cli-reference.md`](./cli-reference.md) | Core CLI commands and common entrypoints | | In-chat commands | [`chat-commands.md`](./chat-commands.md) | Slash commands and periodic task behavior | diff --git a/docs/configuration.md b/docs/configuration.md index f5fb32cb7..01d55c20b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -915,6 +915,12 @@ If you want to always use the local conversion, you can force it using: |--------|------|---------|-------------| | `useJinaReader` | boolean | `true` | If true, Jina Reader will be preferred over the local conversion | +## Image Generation + +Image generation is configured under `tools.imageGeneration` and uses provider credentials from `providers.openrouter` or `providers.aihubmix`. + +See [Image Generation](./image-generation.md) for WebUI usage, provider examples, artifact storage, and troubleshooting. + ## MCP (Model Context Protocol) > [!TIP] diff --git a/docs/image-generation.md b/docs/image-generation.md new file mode 100644 index 000000000..5c63fddf1 --- /dev/null +++ b/docs/image-generation.md @@ -0,0 +1,200 @@ +# Image Generation + +nanobot can generate and edit images through the `generate_image` tool. In the WebUI, users can enable **Image Generation** from the composer, choose an aspect ratio, and keep iterating on generated images inside the same chat. 
+ +The feature is disabled by default. Enable it in `~/.nanobot/config.json`, configure a supported image provider, then restart the gateway. + +## Quick Setup + +OpenRouter example: + +```json +{ + "providers": { + "openrouter": { + "apiKey": "${OPENROUTER_API_KEY}" + } + }, + "tools": { + "imageGeneration": { + "enabled": true, + "provider": "openrouter", + "model": "openai/gpt-5.4-image-2", + "defaultAspectRatio": "1:1", + "defaultImageSize": "1K" + } + } +} +``` + +AIHubMix example: + +```json +{ + "providers": { + "aihubmix": { + "apiKey": "${AIHUBMIX_API_KEY}" + } + }, + "tools": { + "imageGeneration": { + "enabled": true, + "provider": "aihubmix", + "model": "gpt-image-2-free", + "defaultAspectRatio": "1:1", + "defaultImageSize": "1K" + } + } +} +``` + +> [!TIP] +> Prefer environment variables for API keys. nanobot resolves `${VAR_NAME}` values from the environment at startup. + +## WebUI Usage + +In the WebUI composer: + +1. Click **Image Generation**. +2. Choose an aspect ratio: `Auto`, `1:1`, `3:4`, `9:16`, `4:3`, or `16:9`. +3. Describe the image or the edit you want. +4. Attach reference images when editing an existing image. + +Generated images are rendered as assistant media in the chat. Follow-up prompts such as "make it warmer", "change the background", or "try a 16:9 version" can reuse the most recent generated artifact. + +The WebUI hides provider storage details from the user. The agent sees the saved artifact path internally and can pass it back to `generate_image` as `reference_images` for iterative edits. + +## Configuration Reference + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `tools.imageGeneration.enabled` | boolean | `false` | Register the `generate_image` tool | +| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. 
Currently `openrouter` and `aihubmix` are supported | +| `tools.imageGeneration.model` | string | `"openai/gpt-5.4-image-2"` | Provider model name | +| `tools.imageGeneration.defaultAspectRatio` | string | `"1:1"` | Default ratio when the prompt/tool call does not specify one | +| `tools.imageGeneration.defaultImageSize` | string | `"1K"` | Default size hint, for example `1K`, `2K`, `4K`, or `1024x1024` | +| `tools.imageGeneration.maxImagesPerTurn` | number | `4` | Maximum `count` accepted by one tool call. Valid range: `1` to `8` | +| `tools.imageGeneration.saveDir` | string | `"generated"` | Relative directory under nanobot's media directory for generated artifacts | + +Provider settings reuse normal provider config fields: + +| Option | Description | +|--------|-------------| +| `providers.<provider>.apiKey` | Provider API key. Prefer `${ENV_VAR}` | +| `providers.<provider>.apiBase` | Optional custom base URL | +| `providers.<provider>.extraHeaders` | Headers merged into provider requests | +| `providers.<provider>.extraBody` | Extra JSON fields merged into provider request bodies | + +Both camelCase and snake_case config keys are accepted, but docs use camelCase to match `config.json`. + +## Provider Notes + +### OpenRouter + +OpenRouter uses a chat-completions style image response. Configure: + +```json +{ + "tools": { + "imageGeneration": { + "enabled": true, + "provider": "openrouter", + "model": "openai/gpt-5.4-image-2" + } + } +} +``` + +Use a model that supports image generation and image editing if you want reference-image edits. + +### AIHubMix + +AIHubMix `gpt-image-2-free` is supported through AIHubMix's unified predictions API. 
Internally nanobot calls: + +```text +/v1/models/openai/gpt-image-2-free/predictions +``` + +Configure: + +```json +{ + "providers": { + "aihubmix": { + "apiKey": "${AIHUBMIX_API_KEY}", + "extraBody": { + "quality": "low" + } + } + }, + "tools": { + "imageGeneration": { + "enabled": true, + "provider": "aihubmix", + "model": "gpt-image-2-free" + } + } +} +``` + +`quality: low` is optional. It can make free image models faster and less likely to time out, but it is not required for correctness. + +## Artifacts + +Generated images are stored under the active nanobot instance's media directory: + +```text +~/.nanobot/media/generated/YYYY-MM-DD/img_<id>.<ext> +~/.nanobot/media/generated/YYYY-MM-DD/img_<id>.json +``` + +For non-default config locations, the media directory is relative to the active config file's directory. + +The JSON sidecar stores: + +| Field | Meaning | +|-------|---------| +| `id` | Short generated image id, such as `img_ab12cd34ef56` | +| `path` | Local image path used internally for follow-up edits | +| `mime` | Detected image MIME type | +| `prompt` | Prompt used for the generation | +| `model` | Provider model | +| `provider` | Provider name | +| `source_images` | Reference image paths used for edits | +| `created_at` | Creation timestamp | + +Do not paste base64 image payloads into chat. The agent should keep local artifact paths internal unless the user explicitly asks for debugging details. + +## Prompting + +Good image prompts include: + +- Subject and scene. +- Composition, camera, or layout. +- Style, mood, lighting, and color palette. +- Exact text that must appear in the image, quoted. +- Constraints such as "keep the same character" or "preserve the logo". + +Example: + +```text +A minimal app icon for nanobot: friendly robot head, rounded square, soft blue and white palette, clean vector style, no text +``` + +For edits, describe what should change and what must stay fixed: + +```text +Use the reference image. 
Keep the same robot and composition, change the palette to warm orange, and add a subtle sunrise background. +``` + +## Troubleshooting + +| Symptom | Check | +|---------|-------| +| `generate_image` is not available | Set `tools.imageGeneration.enabled` to `true` and restart the gateway | +| Missing API key error | Configure `providers.<provider>.apiKey`; if using `${VAR_NAME}`, confirm the environment variable is visible to the gateway process | +| `unsupported image generation provider` | Use `openrouter` or `aihubmix` | +| AIHubMix says `Incorrect model ID` | Use `model: "gpt-image-2-free"`; nanobot expands it to the required `openai/gpt-image-2-free` model path internally | +| Generation times out | Try a smaller/default image size, set AIHubMix `extraBody.quality` to `"low"`, or retry later | +| Reference image rejected | Reference image paths must be inside the workspace or nanobot media directory and must be valid image files | + diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py index d1952312b..25f34e8be 100644 --- a/nanobot/agent/loop.py +++ b/nanobot/agent/loop.py @@ -30,6 +30,7 @@ from nanobot.agent.tools.ask import ( from nanobot.agent.tools.cron import CronTool from nanobot.agent.tools.file_state import FileStateStore, bind_file_states, reset_file_states from nanobot.agent.tools.filesystem import EditFileTool, ListDirTool, ReadFileTool, WriteFileTool +from nanobot.agent.tools.image_generation import ImageGenerationTool from nanobot.agent.tools.message import MessageTool from nanobot.agent.tools.notebook import NotebookEditTool from nanobot.agent.tools.registry import ToolRegistry @@ -45,9 +46,11 @@ from nanobot.config.schema import AgentDefaults from nanobot.providers.base import LLMProvider from nanobot.providers.factory import ProviderSnapshot from nanobot.session.manager import Session, SessionManager +from nanobot.utils.artifacts import generated_image_paths_from_messages from nanobot.utils.document import extract_documents from 
nanobot.utils.helpers import image_placeholder_text from nanobot.utils.helpers import truncate_text as truncate_text_fn +from nanobot.utils.image_generation_intent import image_generation_prompt from nanobot.utils.progress_events import ( build_tool_event_finish_payloads, build_tool_event_start_payload, @@ -58,7 +61,13 @@ from nanobot.utils.runtime import EMPTY_FINAL_RESPONSE_MESSAGE from nanobot.utils.webui_titles import mark_webui_session, maybe_generate_webui_title_after_turn if TYPE_CHECKING: - from nanobot.config.schema import ChannelsConfig, ExecToolConfig, ToolsConfig, WebToolsConfig + from nanobot.config.schema import ( + ChannelsConfig, + ExecToolConfig, + ProviderConfig, + ToolsConfig, + WebToolsConfig, + ) from nanobot.cron.service import CronService @@ -215,6 +224,8 @@ class AgentLoop: unified_session: bool = False, disabled_skills: list[str] | None = None, tools_config: ToolsConfig | None = None, + image_generation_provider_config: ProviderConfig | None = None, + image_generation_provider_configs: dict[str, ProviderConfig] | None = None, provider_snapshot_loader: Callable[[], ProviderSnapshot] | None = None, provider_signature: tuple[object, ...] 
| None = None, ): @@ -250,6 +261,13 @@ class AgentLoop: ) self.web_config = web_config or WebToolsConfig() self.exec_config = exec_config or ExecToolConfig() + self.tools_config = _tc + self._image_generation_provider_configs = dict(image_generation_provider_configs or {}) + if ( + image_generation_provider_config is not None + and "openrouter" not in self._image_generation_provider_configs + ): + self._image_generation_provider_configs["openrouter"] = image_generation_provider_config self.cron_service = cron_service self.restrict_to_workspace = restrict_to_workspace self._start_time = time.time() @@ -404,6 +422,14 @@ class AgentLoop: user_agent=self.web_config.user_agent, ) ) + if self.tools_config.image_generation.enabled: + self.tools.register( + ImageGenerationTool( + workspace=self.workspace, + config=self.tools_config.image_generation, + provider_configs=self._image_generation_provider_configs, + ) + ) self.tools.register(MessageTool(send_callback=self.bus.publish_outbound, workspace=self.workspace)) self.tools.register(SpawnTool(manager=self.subagents)) if self.cron_service: @@ -1063,12 +1089,12 @@ class AgentLoop: self.context.build_system_prompt(channel=msg.channel), history, pending_ask_id, - msg.content, + image_generation_prompt(msg.content, msg.metadata), ) else: initial_messages = self.context.build_messages( history=history, - current_message=msg.content, + current_message=image_generation_prompt(msg.content, msg.metadata), session_summary=pending, media=msg.media if msg.media else None, channel=msg.channel, @@ -1143,6 +1169,11 @@ class AgentLoop: # Skip the already-persisted user message when saving the turn save_skip = 1 + len(history) + (1 if user_persisted_early else 0) + generated_media = generated_image_paths_from_messages(all_msgs[save_skip:]) + if generated_media and all_msgs and all_msgs[-1].get("role") == "assistant": + existing_media = all_msgs[-1].get("media") + media = existing_media if isinstance(existing_media, list) else [] + 
all_msgs[-1]["media"] = list(dict.fromkeys([*media, *generated_media])) self._save_turn(session, all_msgs, save_skip) session.enforce_file_cap(on_archive=self.context.memory.raw_archive) self._clear_pending_user_turn(session) @@ -1175,6 +1206,7 @@ class AgentLoop: channel=msg.channel, chat_id=msg.chat_id, content=final_content, + media=generated_media, metadata=meta, buttons=buttons, ) diff --git a/nanobot/agent/tools/image_generation.py b/nanobot/agent/tools/image_generation.py new file mode 100644 index 000000000..37a2e8740 --- /dev/null +++ b/nanobot/agent/tools/image_generation.py @@ -0,0 +1,192 @@ +"""Image generation tool.""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from nanobot.agent.tools.base import Tool, tool_parameters +from nanobot.agent.tools.schema import ( + ArraySchema, + IntegerSchema, + StringSchema, + tool_parameters_schema, +) +from nanobot.config.paths import get_media_dir +from nanobot.config.schema import ImageGenerationToolConfig +from nanobot.providers.image_generation import ( + AIHubMixImageGenerationClient, + ImageGenerationError, + OpenRouterImageGenerationClient, +) +from nanobot.utils.artifacts import ( + ArtifactError, + generated_image_tool_result, + store_generated_image_artifact, +) +from nanobot.utils.helpers import detect_image_mime + +if TYPE_CHECKING: + from nanobot.config.schema import ProviderConfig + + +@tool_parameters( + tool_parameters_schema( + prompt=StringSchema( + "Detailed image generation or edit prompt. Include style, subject, composition, colors, and constraints.", + min_length=1, + ), + reference_images=ArraySchema( + StringSchema("Local path of an existing image artifact or user-provided image to use as an edit reference."), + description="Optional local image paths. Use generated artifact paths for iterative edits.", + ), + aspect_ratio=StringSchema( + "Optional output aspect ratio, e.g. 
1:1, 16:9, 9:16, 4:3.", + ), + image_size=StringSchema( + "Optional output size hint supported by the configured provider, e.g. 1K, 2K, 4K, or 1024x1024.", + ), + count=IntegerSchema( + description="Number of images to generate in this turn.", + minimum=1, + maximum=8, + ), + required=["prompt"], + ) +) +class ImageGenerationTool(Tool): + """Generate persistent image artifacts through the configured image provider.""" + + def __init__( + self, + *, + workspace: str | Path, + config: ImageGenerationToolConfig, + provider_config: ProviderConfig | None = None, + provider_configs: dict[str, ProviderConfig] | None = None, + ) -> None: + self.workspace = Path(workspace).expanduser() + self.config = config + self.provider_configs = dict(provider_configs or {}) + if provider_config is not None and "openrouter" not in self.provider_configs: + self.provider_configs["openrouter"] = provider_config + + @property + def name(self) -> str: + return "generate_image" + + @property + def description(self) -> str: + return ( + "Generate or edit images and store them as persistent artifacts. " + "Returns artifact ids and local paths. For edits, pass prior generated image paths " + "or user image paths as reference_images." 
+ ) + + def _provider_config(self) -> ProviderConfig | None: + return self.provider_configs.get(self.config.provider) + + def _provider_client(self) -> OpenRouterImageGenerationClient | AIHubMixImageGenerationClient | None: + provider = self._provider_config() + kwargs = { + "api_key": provider.api_key if provider else None, + "api_base": provider.api_base if provider else None, + "extra_headers": provider.extra_headers if provider else None, + "extra_body": provider.extra_body if provider else None, + } + if self.config.provider == "openrouter": + return OpenRouterImageGenerationClient(**kwargs) + if self.config.provider == "aihubmix": + return AIHubMixImageGenerationClient(**kwargs) + return None + + def _missing_api_key_error(self) -> str: + provider = self.config.provider + if provider == "openrouter": + return "Error: OpenRouter API key is not configured. Set providers.openrouter.apiKey." + if provider == "aihubmix": + return "Error: AIHubMix API key is not configured. Set providers.aihubmix.apiKey." + return f"Error: {provider} API key is not configured." 
+ + def _resolve_reference_image(self, value: str) -> str: + raw_path = Path(value).expanduser() + path = raw_path if raw_path.is_absolute() else self.workspace / raw_path + try: + resolved = path.resolve(strict=True) + except OSError as exc: + raise ImageGenerationError(f"reference image not found: {value}") from exc + + allowed_roots = [self.workspace.resolve(), get_media_dir().resolve()] + if not any(_is_relative_to(resolved, root) for root in allowed_roots): + raise ImageGenerationError( + "reference_images must be inside the workspace or nanobot media directory" + ) + if not resolved.is_file(): + raise ImageGenerationError(f"reference image is not a file: {value}") + raw = resolved.read_bytes() + if detect_image_mime(raw) is None: + raise ImageGenerationError(f"unsupported reference image: {value}") + return str(resolved) + + def _resolve_reference_images(self, values: list[str] | None) -> list[str]: + if not values: + return [] + return [self._resolve_reference_image(value) for value in values if value] + + async def execute( + self, + prompt: str, + reference_images: list[str] | None = None, + aspect_ratio: str | None = None, + image_size: str | None = None, + count: int | None = None, + **kwargs: Any, + ) -> str: + client = self._provider_client() + if client is None: + return f"Error: unsupported image generation provider '{self.config.provider}'" + provider = self._provider_config() + if not provider or not provider.api_key: + return self._missing_api_key_error() + + requested = count or 1 + if requested > self.config.max_images_per_turn: + return ( + "Error: count exceeds tools.imageGeneration.maxImagesPerTurn " + f"({self.config.max_images_per_turn})" + ) + + try: + refs = self._resolve_reference_images(reference_images) + artifacts: list[dict[str, Any]] = [] + while len(artifacts) < requested: + response = await client.generate( + prompt=prompt, + model=self.config.model, + reference_images=refs, + aspect_ratio=aspect_ratio or 
self.config.default_aspect_ratio, + image_size=image_size or self.config.default_image_size, + ) + for image_data_url in response.images: + artifact = store_generated_image_artifact( + image_data_url, + prompt=prompt, + model=self.config.model, + source_images=refs, + save_dir=self.config.save_dir, + provider=self.config.provider, + ) + artifacts.append(artifact) + if len(artifacts) >= requested: + break + return generated_image_tool_result(artifacts) + except (ArtifactError, ImageGenerationError, OSError) as exc: + return f"Error: {exc}" + + +def _is_relative_to(path: Path, root: Path) -> bool: + try: + path.relative_to(root) + except ValueError: + return False + return True diff --git a/nanobot/agent/tools/message.py b/nanobot/agent/tools/message.py index 6e3d037f0..62730dd6b 100644 --- a/nanobot/agent/tools/message.py +++ b/nanobot/agent/tools/message.py @@ -158,7 +158,7 @@ class MessageTool(Tool): metadata = dict(self._default_metadata.get()) if same_target else {} if message_id: metadata["message_id"] = message_id - if self._record_channel_delivery_var.get(): + if self._record_channel_delivery_var.get() or media: metadata["_record_channel_delivery"] = True msg = OutboundMessage( diff --git a/nanobot/channels/websocket.py b/nanobot/channels/websocket.py index 459276501..e79639bd9 100644 --- a/nanobot/channels/websocket.py +++ b/nanobot/channels/websocket.py @@ -1215,6 +1215,13 @@ class WebSocketChannel(BaseChannel): metadata: dict[str, Any] = {"remote": getattr(connection, "remote_address", None)} if envelope.get("webui") is True: metadata["webui"] = True + image_generation = envelope.get("image_generation") + if isinstance(image_generation, dict) and image_generation.get("enabled") is True: + aspect_ratio = image_generation.get("aspect_ratio") + metadata["image_generation"] = { + "enabled": True, + "aspect_ratio": aspect_ratio if isinstance(aspect_ratio, str) else None, + } await self._handle_message( sender_id=client_id, chat_id=cid, @@ -1258,7 +1265,14 @@ 
class WebSocketChannel(BaseChannel): # Snapshot the subscriber set so ConnectionClosed cleanups mid-iteration are safe. conns = list(self._subs.get(msg.chat_id, ())) if not conns: - self.logger.warning("no active subscribers for chat_id={}", msg.chat_id) + if ( + msg.metadata.get("_progress") + or msg.metadata.get("_turn_end") + or msg.metadata.get("_session_updated") + ): + self.logger.debug("no active subscribers for chat_id={}", msg.chat_id) + else: + self.logger.warning("no active subscribers for chat_id={}", msg.chat_id) return # Signal that the agent has fully finished processing the current turn. if msg.metadata.get("_turn_end"): diff --git a/nanobot/cli/commands.py b/nanobot/cli/commands.py index 1f0186f1d..453f40f42 100644 --- a/nanobot/cli/commands.py +++ b/nanobot/cli/commands.py @@ -528,6 +528,7 @@ def serve( raise typer.Exit(1) from loguru import logger + from nanobot.agent.loop import AgentLoop from nanobot.api.server import create_app from nanobot.bus.queue import MessageBus @@ -571,6 +572,10 @@ def serve( consolidation_ratio=runtime_config.agents.defaults.consolidation_ratio, max_messages=runtime_config.agents.defaults.max_messages, tools_config=runtime_config.tools, + image_generation_provider_configs={ + "openrouter": runtime_config.providers.openrouter, + "aihubmix": runtime_config.providers.aihubmix, + }, ) model_name = runtime_config.agents.defaults.model @@ -696,6 +701,10 @@ def _run_gateway( consolidation_ratio=config.agents.defaults.consolidation_ratio, max_messages=config.agents.defaults.max_messages, tools_config=config.tools, + image_generation_provider_configs={ + "openrouter": config.providers.openrouter, + "aihubmix": config.providers.aihubmix, + }, provider_snapshot_loader=load_provider_snapshot, provider_signature=provider_snapshot.signature, ) @@ -735,7 +744,10 @@ def _run_gateway( ): key = session_key or _channel_session_key(msg.channel, msg.chat_id) session = session_manager.get_or_create(key) - session.add_message("assistant", 
msg.content, _channel_delivery=True) + extra: dict[str, Any] = {"_channel_delivery": True} + if msg.media: + extra["media"] = list(msg.media) + session.add_message("assistant", msg.content, **extra) session_manager.save(session) await bus.publish_outbound(msg) diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py index aa8b0a5e5..47f2babcd 100644 --- a/nanobot/config/schema.py +++ b/nanobot/config/schema.py @@ -253,12 +253,25 @@ class MyToolConfig(Base): allow_set: bool = False # let `my` modify loop state (read-only if False) +class ImageGenerationToolConfig(Base): + """Image generation tool configuration.""" + + enabled: bool = False + provider: str = "openrouter" + model: str = "openai/gpt-5.4-image-2" + default_aspect_ratio: str = "1:1" + default_image_size: str = "1K" + max_images_per_turn: int = Field(default=4, ge=1, le=8) + save_dir: str = "generated" + + class ToolsConfig(Base): """Tools configuration.""" web: WebToolsConfig = Field(default_factory=WebToolsConfig) exec: ExecToolConfig = Field(default_factory=ExecToolConfig) my: MyToolConfig = Field(default_factory=MyToolConfig) + image_generation: ImageGenerationToolConfig = Field(default_factory=ImageGenerationToolConfig) restrict_to_workspace: bool = False # restrict all tool access to workspace directory mcp_servers: dict[str, MCPServerConfig] = Field(default_factory=dict) ssrf_whitelist: list[str] = Field(default_factory=list) # CIDR ranges to exempt from SSRF blocking (e.g. 
["100.64.0.0/10"] for Tailscale) diff --git a/nanobot/nanobot.py b/nanobot/nanobot.py index 60c6dcdcb..77decc563 100644 --- a/nanobot/nanobot.py +++ b/nanobot/nanobot.py @@ -87,6 +87,10 @@ class Nanobot: session_ttl_minutes=defaults.session_ttl_minutes, consolidation_ratio=defaults.consolidation_ratio, tools_config=config.tools, + image_generation_provider_configs={ + "openrouter": config.providers.openrouter, + "aihubmix": config.providers.aihubmix, + }, ) return cls(loop) diff --git a/nanobot/providers/image_generation.py b/nanobot/providers/image_generation.py new file mode 100644 index 000000000..d1e7a1b24 --- /dev/null +++ b/nanobot/providers/image_generation.py @@ -0,0 +1,395 @@ +"""Image generation provider helpers.""" + +from __future__ import annotations + +import base64 +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import httpx + +from nanobot.providers.registry import find_by_name +from nanobot.utils.helpers import detect_image_mime + +_OPENROUTER_ATTRIBUTION_HEADERS = { + "HTTP-Referer": "https://github.com/HKUDS/nanobot", + "X-OpenRouter-Title": "nanobot", + "X-OpenRouter-Categories": "cli-agent,personal-agent", +} +_DEFAULT_TIMEOUT_S = 120.0 +_AIHUBMIX_TIMEOUT_S = 300.0 +_AIHUBMIX_ASPECT_RATIO_SIZES = { + "1:1": "1024x1024", + "3:4": "1024x1536", + "9:16": "1024x1536", + "4:3": "1536x1024", + "16:9": "1536x1024", +} + + +class ImageGenerationError(RuntimeError): + """Raised when the image generation provider cannot return images.""" + + +@dataclass(frozen=True) +class GeneratedImageResponse: + """Images and optional text returned by the provider.""" + + images: list[str] + content: str + raw: dict[str, Any] + + +def _provider_base_url(provider: str, api_base: str | None, fallback: str) -> str: + if api_base: + return api_base.rstrip("/") + spec = find_by_name(provider) + if spec and spec.default_api_base: + return spec.default_api_base.rstrip("/") + return fallback + + +def image_path_to_data_url(path: str | 
Path) -> str: + """Convert a local image path to an image data URL.""" + p = Path(path).expanduser() + raw = p.read_bytes() + mime = detect_image_mime(raw) + if mime is None: + raise ImageGenerationError(f"unsupported reference image: {p}") + encoded = base64.b64encode(raw).decode("ascii") + return f"data:{mime};base64,{encoded}" + + +def _b64_png_data_url(value: str) -> str: + return f"data:image/png;base64,{value}" + + +def _aihubmix_size(aspect_ratio: str | None, image_size: str | None) -> str: + """Return an OpenAI Images API size string for AIHubMix. + + The WebUI emits compact size hints like ``1K`` for OpenRouter. AIHubMix's + Images API expects OpenAI-style dimensions or ``auto``, so only pass + through explicit dimension strings and otherwise derive the closest + supported orientation from aspect ratio. + """ + if image_size and "x" in image_size.lower(): + return image_size + if aspect_ratio in _AIHUBMIX_ASPECT_RATIO_SIZES: + return _AIHUBMIX_ASPECT_RATIO_SIZES[aspect_ratio] + return "auto" + + +def _aihubmix_model_path(model: str) -> str: + if "/" in model: + return model + if model.startswith(("gpt-image-", "dall-e-")): + return f"openai/{model}" + return model + + +async def _download_image_data_url( + client: httpx.AsyncClient, + url: str, +) -> str: + response = await client.get(url) + try: + response.raise_for_status() + except httpx.HTTPStatusError as exc: + detail = response.text[:500] + raise ImageGenerationError(f"failed to download generated image: {detail}") from exc + raw = response.content + mime = detect_image_mime(raw) + if mime is None: + raise ImageGenerationError("generated image URL did not return a supported image") + encoded = base64.b64encode(raw).decode("ascii") + return f"data:{mime};base64,{encoded}" + + +class OpenRouterImageGenerationClient: + """Small async client for OpenRouter Chat Completions image generation.""" + + def __init__( + self, + *, + api_key: str | None, + api_base: str | None = None, + extra_headers: dict[str, 
str] | None = None, + extra_body: dict[str, Any] | None = None, + timeout: float = _DEFAULT_TIMEOUT_S, + client: httpx.AsyncClient | None = None, + ) -> None: + self.api_key = api_key + self.api_base = _provider_base_url( + "openrouter", + api_base, + "https://openrouter.ai/api/v1", + ) + self.extra_headers = extra_headers or {} + self.extra_body = extra_body or {} + self.timeout = timeout + self._client = client + + async def generate( + self, + *, + prompt: str, + model: str, + reference_images: list[str] | None = None, + aspect_ratio: str | None = None, + image_size: str | None = None, + ) -> GeneratedImageResponse: + if not self.api_key: + raise ImageGenerationError( + "OpenRouter API key is not configured. Set providers.openrouter.apiKey." + ) + + content: str | list[dict[str, Any]] + references = list(reference_images or []) + if references: + blocks: list[dict[str, Any]] = [{"type": "text", "text": prompt}] + blocks.extend( + {"type": "image_url", "image_url": {"url": image_path_to_data_url(path)}} + for path in references + ) + content = blocks + else: + content = prompt + + body: dict[str, Any] = { + "model": model, + "messages": [{"role": "user", "content": content}], + "modalities": ["image", "text"], + "stream": False, + } + image_config: dict[str, str] = {} + if aspect_ratio: + image_config["aspect_ratio"] = aspect_ratio + if image_size: + image_config["image_size"] = image_size + if image_config: + body["image_config"] = image_config + body.update(self.extra_body) + + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + **_OPENROUTER_ATTRIBUTION_HEADERS, + **self.extra_headers, + } + url = f"{self.api_base}/chat/completions" + + if self._client is not None: + response = await self._client.post(url, headers=headers, json=body) + else: + async with httpx.AsyncClient(timeout=self.timeout) as client: + response = await client.post(url, headers=headers, json=body) + + try: + response.raise_for_status() + except 
httpx.HTTPStatusError as exc: + detail = response.text[:500] + raise ImageGenerationError(f"OpenRouter image generation failed: {detail}") from exc + + data = response.json() + images: list[str] = [] + text_parts: list[str] = [] + for choice in data.get("choices") or []: + if not isinstance(choice, dict): + continue + message = choice.get("message") or {} + if isinstance(message.get("content"), str): + text_parts.append(message["content"]) + for image in message.get("images") or []: + if not isinstance(image, dict): + continue + image_url = image.get("image_url") or image.get("imageUrl") or {} + url_value = image_url.get("url") if isinstance(image_url, dict) else None + if isinstance(url_value, str) and url_value.startswith("data:image/"): + images.append(url_value) + + if not images: + provider_error = data.get("error") if isinstance(data, dict) else None + if provider_error: + raise ImageGenerationError(f"OpenRouter returned no images: {provider_error}") + raise ImageGenerationError("OpenRouter returned no images for this request") + + return GeneratedImageResponse( + images=images, + content="\n".join(part for part in text_parts if part).strip(), + raw=data, + ) + + +class AIHubMixImageGenerationClient: + """Small async client for AIHubMix unified image generation.""" + + def __init__( + self, + *, + api_key: str | None, + api_base: str | None = None, + extra_headers: dict[str, str] | None = None, + extra_body: dict[str, Any] | None = None, + timeout: float = _AIHUBMIX_TIMEOUT_S, + client: httpx.AsyncClient | None = None, + ) -> None: + self.api_key = api_key + self.api_base = _provider_base_url( + "aihubmix", + api_base, + "https://aihubmix.com/v1", + ) + self.extra_headers = extra_headers or {} + self.extra_body = extra_body or {} + self.timeout = timeout + self._client = client + + async def generate( + self, + *, + prompt: str, + model: str, + reference_images: list[str] | None = None, + aspect_ratio: str | None = None, + image_size: str | None = None, + ) 
-> GeneratedImageResponse: + if not self.api_key: + raise ImageGenerationError( + "AIHubMix API key is not configured. Set providers.aihubmix.apiKey." + ) + + refs = list(reference_images or []) + headers = { + "Authorization": f"Bearer {self.api_key}", + **self.extra_headers, + } + size = _aihubmix_size(aspect_ratio, image_size) + + if self._client is not None: + return await self._generate_with_client( + self._client, + prompt=prompt, + model=model, + reference_images=refs, + size=size, + headers=headers, + ) + async with httpx.AsyncClient(timeout=self.timeout) as client: + return await self._generate_with_client( + client, + prompt=prompt, + model=model, + reference_images=refs, + size=size, + headers=headers, + ) + + async def _generate_with_client( + self, + client: httpx.AsyncClient, + *, + prompt: str, + model: str, + reference_images: list[str], + size: str, + headers: dict[str, str], + ) -> GeneratedImageResponse: + image_input: str | list[str] | None = None + if reference_images: + image_refs = [image_path_to_data_url(path) for path in reference_images] + image_input = image_refs[0] if len(image_refs) == 1 else image_refs + + input_body: dict[str, Any] = { + "prompt": prompt, + "n": 1, + "size": size, + } + if image_input is not None: + input_body["image"] = image_input + input_body.update(self.extra_body) + + body = {"input": input_body} + model_path = _aihubmix_model_path(model) + url = f"{self.api_base}/models/{model_path}/predictions" + try: + response = await client.post( + url, + headers={**headers, "Content-Type": "application/json"}, + json=body, + ) + except httpx.TimeoutException as exc: + raise ImageGenerationError("AIHubMix image generation timed out") from exc + except httpx.RequestError as exc: + raise ImageGenerationError(f"AIHubMix image generation request failed: {exc}") from exc + + try: + response.raise_for_status() + except httpx.HTTPStatusError as exc: + detail = response.text[:500] + raise ImageGenerationError(f"AIHubMix image 
generation failed: {detail}") from exc + + payload = response.json() + images = await _aihubmix_images_from_payload(client, payload) + + if not images: + provider_error = payload.get("error") if isinstance(payload, dict) else None + if provider_error: + raise ImageGenerationError(f"AIHubMix returned no images: {provider_error}") + raise ImageGenerationError("AIHubMix returned no images for this request") + + return GeneratedImageResponse(images=images, content="", raw=payload) + + +async def _aihubmix_images_from_payload( + client: httpx.AsyncClient, + payload: dict[str, Any], +) -> list[str]: + images: list[str] = [] + candidates: list[Any] = [] + if "data" in payload: + candidates.append(payload["data"]) + if "output" in payload: + candidates.append(payload["output"]) + + async def collect(value: Any) -> None: + if isinstance(value, list): + for item in value: + await collect(item) + return + if isinstance(value, str): + if value.startswith("data:image/"): + images.append(value) + elif value.startswith(("http://", "https://")): + images.append(await _download_image_data_url(client, value)) + return + if not isinstance(value, dict): + return + + b64_json = value.get("b64_json") + if isinstance(b64_json, str) and b64_json: + images.append(_b64_png_data_url(b64_json)) + elif b64_json is not None: + await collect(b64_json) + + bytes_base64 = value.get("bytesBase64") or value.get("bytes_base64") or value.get("base64") + if isinstance(bytes_base64, str) and bytes_base64: + images.append(_b64_png_data_url(bytes_base64)) + + image_url = value.get("image_url") or value.get("imageUrl") + if isinstance(image_url, dict): + await collect(image_url.get("url")) + elif image_url is not None: + await collect(image_url) + + url_value = value.get("url") + if url_value is not None: + await collect(url_value) + + for key in ("images", "image", "output"): + if key in value: + await collect(value[key]) + + for candidate in candidates: + await collect(candidate) + return images diff 
--git a/nanobot/skills/image-generation/SKILL.md b/nanobot/skills/image-generation/SKILL.md new file mode 100644 index 000000000..dc89372ee --- /dev/null +++ b/nanobot/skills/image-generation/SKILL.md @@ -0,0 +1,109 @@ +--- +name: image-generation +description: Generate images and iteratively edit saved image artifacts. +--- + +# Image Generation + +Use the `generate_image` tool when the user asks you to create, render, draw, design, generate, or edit an image. + +If the `generate_image` tool is not available in the current tool list, tell the user that image generation is not enabled for this nanobot instance. + +## When To Use + +- Text-to-image: call `generate_image` with a concrete `prompt`. +- Image editing: pass the saved artifact path or user image path in `reference_images`. +- Iterative edits in the same conversation: prefer the most recent generated image artifact if the user says things like "make it brighter", "change the background", or "try another version". +- Ambiguous edits: ask a short clarifying question if multiple recent images could be the target. + +## Prompt Rules + +Write prompts with enough detail for image models: + +- Subject and scene. +- Composition and camera or layout. +- Style, mood, lighting, and color palette. +- Text that must appear in the image, quoted exactly. +- Constraints such as "keep the same character", "preserve the logo", or "do not change the background". + +## Artifact Rules + +The tool stores generated images as persistent artifacts under nanobot's media directory and returns structured metadata: + +- `id`: generated image id, such as `img_ab12cd34ef56`. +- `path`: local file path for internal follow-up edits. +- `mime`: image MIME type. +- `prompt`, `model`, and `source_images`: provenance for follow-up edits. + +In normal user-facing replies, do not expose local filesystem paths. Keep the reply natural, for example "Done, I generated it." 
You may include the short image `id` when it helps the user refer to a specific image, but keep raw `path` internal unless the user explicitly asks for debug details or a local artifact reference. Never paste base64. + +For follow-up edits, pass the prior artifact `path` to `reference_images`. If the user provides a new uploaded image, use that path as the reference instead. + +## Provider Notes + +Do not ask users to paste API keys into chat. If configuration is needed, describe the fields and remind them to restart the gateway after changing config. + +For OpenRouter, the image tool expects: + +```json +{ + "providers": { + "openrouter": { + "apiKey": "sk-or-..." + } + }, + "tools": { + "imageGeneration": { + "enabled": true, + "provider": "openrouter", + "model": "openai/gpt-5.4-image-2" + } + } +} +``` + +For AIHubMix, the image tool expects: + +```json +{ + "providers": { + "aihubmix": { + "apiKey": "sk-..." + } + }, + "tools": { + "imageGeneration": { + "enabled": true, + "provider": "aihubmix", + "model": "gpt-image-2-free" + } + } +} +``` + +AIHubMix `gpt-image-2-free` uses AIHubMix's unified predictions endpoint internally (`/v1/models/openai/gpt-image-2-free/predictions`), not the OpenAI Images `/v1/images/generations` endpoint. If it fails with "Incorrect model ID", do not assume the key lacks permission until the provider config, model name, and gateway restart have been checked. + +`providers.aihubmix.extraBody` can be used for provider-specific options. For example, `"extraBody": {"quality": "low"}` is optional but can make `gpt-image-2-free` faster and less likely to time out. + +## Examples + +Generate a new image: + +```text +generate_image( + prompt="A minimal app icon for nanobot: friendly robot head, rounded square, soft blue and white palette, clean vector style, no text", + aspect_ratio="1:1", + image_size="1K" +) +``` + +Edit the latest generated artifact: + +```text +generate_image( + prompt="Use the reference image. 
Keep the same robot and composition, but change the palette to warm orange and add a subtle sunrise background.", + reference_images=["/home/user/.nanobot/media/generated/2026-05-08/img_ab12cd34ef56.png"], + aspect_ratio="1:1", + image_size="1K" +) +``` diff --git a/nanobot/utils/artifacts.py b/nanobot/utils/artifacts.py new file mode 100644 index 000000000..34802a7ac --- /dev/null +++ b/nanobot/utils/artifacts.py @@ -0,0 +1,161 @@ +"""Artifact persistence helpers for generated media.""" + +from __future__ import annotations + +import base64 +import binascii +import json +import re +import uuid +from datetime import datetime +from pathlib import Path, PurePosixPath +from typing import Any + +from nanobot.config.paths import get_media_dir +from nanobot.utils.helpers import detect_image_mime, ensure_dir + +_DATA_IMAGE_RE = re.compile(r"^data:(image/[A-Za-z0-9.+-]+);base64,(.*)$", re.DOTALL) +_MIME_EXTENSIONS = { + "image/png": ".png", + "image/jpeg": ".jpg", + "image/webp": ".webp", + "image/gif": ".gif", +} +_GENERATE_IMAGE_TOOL_NAME = "generate_image" + + +class ArtifactError(ValueError): + """Raised when an artifact cannot be safely decoded or stored.""" + + +def decode_image_data_url(data_url: str) -> tuple[bytes, str]: + """Decode a base64 image data URL and return ``(bytes, mime)``.""" + match = _DATA_IMAGE_RE.match(data_url.strip()) + if match is None: + raise ArtifactError("expected a base64 image data URL") + + declared_mime, encoded = match.groups() + try: + raw = base64.b64decode(encoded, validate=True) + except binascii.Error as exc: + raise ArtifactError("invalid base64 image payload") from exc + + detected_mime = detect_image_mime(raw) + if detected_mime is None: + raise ArtifactError("unsupported or unrecognized image data") + if declared_mime != detected_mime: + declared_mime = detected_mime + return raw, declared_mime + + +def _safe_relative_dir(save_dir: str) -> Path: + normalized = save_dir.replace("\\", "/").strip("/") + if not normalized: + raise 
ArtifactError("save_dir must not be empty") + rel = PurePosixPath(normalized) + if rel.is_absolute() or any(part in {"", ".", ".."} for part in rel.parts): + raise ArtifactError("save_dir must be a safe relative path") + return Path(*rel.parts) + + +def _artifact_root(save_dir: str) -> Path: + media_root = get_media_dir().resolve() + root = (media_root / _safe_relative_dir(save_dir)).resolve() + try: + root.relative_to(media_root) + except ValueError as exc: + raise ArtifactError("artifact directory escapes media root") from exc + return root + + +def store_generated_image_artifact( + data_url: str, + *, + prompt: str, + model: str, + source_images: list[str] | None = None, + save_dir: str = "generated", + provider: str = "openrouter", + created_at: datetime | None = None, +) -> dict[str, Any]: + """Persist a generated image and sidecar metadata under the media root.""" + raw, mime = decode_image_data_url(data_url) + ext = _MIME_EXTENSIONS.get(mime) + if ext is None: + raise ArtifactError(f"unsupported image MIME type: {mime}") + + now = created_at or datetime.now().astimezone() + day_dir = ensure_dir(_artifact_root(save_dir) / now.strftime("%Y-%m-%d")) + artifact_id = f"img_{uuid.uuid4().hex[:12]}" + image_path = day_dir / f"{artifact_id}{ext}" + metadata_path = day_dir / f"{artifact_id}.json" + + image_path.write_bytes(raw) + metadata: dict[str, Any] = { + "id": artifact_id, + "path": str(image_path), + "mime": mime, + "prompt": prompt, + "model": model, + "provider": provider, + "source_images": list(source_images or []), + "created_at": now.isoformat(), + } + metadata_path.write_text( + json.dumps(metadata, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + return metadata + + +def generated_image_tool_result(artifacts: list[dict[str, Any]]) -> str: + """Return the compact structured result exposed to the LLM.""" + return json.dumps( + { + "artifacts": artifacts, + "next_step": ( + "Use these artifact paths as reference_images for follow-up edits. 
" + "Mention the image id/path to the user; do not paste base64." + ), + }, + ensure_ascii=False, + ) + + +def _extract_text_payload(content: Any) -> str | None: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for block in content: + if isinstance(block, dict) and isinstance(block.get("text"), str): + parts.append(block["text"]) + return "\n".join(parts) if parts else None + return None + + +def generated_image_paths_from_messages(messages: list[dict[str, Any]]) -> list[str]: + """Collect generated image artifact paths from generate_image tool results.""" + paths: list[str] = [] + seen: set[str] = set() + for message in messages: + if message.get("role") != "tool" or message.get("name") != _GENERATE_IMAGE_TOOL_NAME: + continue + payload = _extract_text_payload(message.get("content")) + if not payload: + continue + try: + data = json.loads(payload) + except json.JSONDecodeError: + continue + artifacts = data.get("artifacts") if isinstance(data, dict) else None + if not isinstance(artifacts, list): + continue + for artifact in artifacts: + if not isinstance(artifact, dict): + continue + path = artifact.get("path") + if isinstance(path, str) and path and path not in seen: + paths.append(path) + seen.add(path) + return paths diff --git a/nanobot/utils/image_generation_intent.py b/nanobot/utils/image_generation_intent.py new file mode 100644 index 000000000..8d62e6375 --- /dev/null +++ b/nanobot/utils/image_generation_intent.py @@ -0,0 +1,27 @@ +"""Helpers for WebUI image-generation intent metadata.""" + +from __future__ import annotations + +from typing import Any + +IMAGE_GENERATION_METADATA_KEY = "image_generation" + + +def image_generation_prompt(content: str, metadata: dict[str, Any] | None) -> str: + """Decorate a user prompt when WebUI image mode is enabled.""" + raw = (metadata or {}).get(IMAGE_GENERATION_METADATA_KEY) + if not isinstance(raw, dict) or raw.get("enabled") is not True: + return content + + 
aspect_ratio = raw.get("aspect_ratio") + if isinstance(aspect_ratio, str) and aspect_ratio.strip(): + instruction = ( + "The user selected WebUI image generation mode. Use the generate_image tool. " + f"When calling generate_image, pass aspect_ratio={aspect_ratio!r}." + ) + else: + instruction = ( + "The user selected WebUI image generation mode. Use the generate_image tool. " + "Choose the most suitable aspect_ratio yourself from the prompt and intended use." + ) + return f"{content}\n\n[WebUI image generation instruction: {instruction}]" diff --git a/tests/agent/test_loop_image_generation_media.py b/tests/agent/test_loop_image_generation_media.py new file mode 100644 index 000000000..6c10ecb1c --- /dev/null +++ b/tests/agent/test_loop_image_generation_media.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from nanobot.agent.loop import AgentLoop +from nanobot.bus.events import InboundMessage +from nanobot.bus.queue import MessageBus +from nanobot.config.loader import set_config_path +from nanobot.config.schema import ImageGenerationToolConfig, ProviderConfig, ToolsConfig +from nanobot.providers.base import LLMResponse, ToolCallRequest +from nanobot.providers.image_generation import GeneratedImageResponse + +PNG_DATA_URL = ( + "data:image/png;base64," + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+/p9sAAAAASUVORK5CYII=" +) + + +class FakeImageClient: + def __init__(self, **kwargs: Any) -> None: + pass + + async def generate(self, **kwargs: Any) -> GeneratedImageResponse: + return GeneratedImageResponse(images=[PNG_DATA_URL], content="", raw={}) + + +@pytest.mark.asyncio +async def test_generated_image_media_is_attached_to_final_assistant_message( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + set_config_path(tmp_path / "config.json") + monkeypatch.setattr( + 
"nanobot.agent.tools.image_generation.OpenRouterImageGenerationClient", + FakeImageClient, + ) + provider = MagicMock() + provider.get_default_model.return_value = "test-model" + provider.generation.max_tokens = 4096 + provider.chat_with_retry = AsyncMock( + side_effect=[ + LLMResponse( + content="", + finish_reason="tool_calls", + tool_calls=[ + ToolCallRequest( + id="call_img", + name="generate_image", + arguments={"prompt": "draw a tiny icon"}, + ) + ], + ), + LLMResponse(content="Done", finish_reason="stop"), + ] + ) + provider.chat_stream_with_retry = AsyncMock() + loop = AgentLoop( + bus=MessageBus(), + provider=provider, + workspace=tmp_path, + model="test-model", + tools_config=ToolsConfig( + image_generation=ImageGenerationToolConfig(enabled=True), + ), + image_generation_provider_config=ProviderConfig(api_key="sk-or-test"), + ) + loop.consolidator.maybe_consolidate_by_tokens = AsyncMock(return_value=False) # type: ignore[method-assign] + + result = await loop._process_message( + InboundMessage( + channel="websocket", + sender_id="user", + chat_id="chat-image", + content="draw an icon", + ) + ) + + assert result is not None + assert result.content == "Done" + assert len(result.media) == 1 + assert Path(result.media[0]).is_file() + + session = loop.sessions.get_or_create("websocket:chat-image") + assert session.messages[-1]["role"] == "assistant" + assert session.messages[-1]["media"] == result.media diff --git a/tests/providers/test_image_generation.py b/tests/providers/test_image_generation.py new file mode 100644 index 000000000..8f2801d68 --- /dev/null +++ b/tests/providers/test_image_generation.py @@ -0,0 +1,204 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import httpx +import pytest + +from nanobot.providers.image_generation import ( + AIHubMixImageGenerationClient, + GeneratedImageResponse, + ImageGenerationError, + OpenRouterImageGenerationClient, +) + +PNG_BYTES = ( + 
b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01" + b"\x00\x00\x00\x01\x08\x04\x00\x00\x00\xb5\x1c\x0c\x02" + b"\x00\x00\x00\x0bIDATx\xdacd\xfc\xff\x1f\x00\x03\x03" + b"\x02\x00\xef\xbf\xa7\xdb\x00\x00\x00\x00IEND\xaeB`\x82" +) +PNG_DATA_URL = ( + "data:image/png;base64," + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+/p9sAAAAASUVORK5CYII=" +) + + +class FakeResponse: + def __init__( + self, + payload: dict[str, Any], + status_code: int = 200, + content: bytes = b"", + ) -> None: + self._payload = payload + self.status_code = status_code + self.text = str(payload) + self.content = content + self.request = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions") + + def json(self) -> dict[str, Any]: + return self._payload + + def raise_for_status(self) -> None: + if self.status_code >= 400: + response = httpx.Response(self.status_code, request=self.request, text=self.text) + raise httpx.HTTPStatusError("failed", request=self.request, response=response) + + +class FakeClient: + def __init__(self, response: FakeResponse) -> None: + self.response = response + self.get_response = response + self.calls: list[dict[str, Any]] = [] + self.get_calls: list[dict[str, Any]] = [] + + async def post(self, url: str, **kwargs: Any) -> FakeResponse: + self.calls.append({"url": url, **kwargs}) + return self.response + + async def get(self, url: str, **kwargs: Any) -> FakeResponse: + self.get_calls.append({"url": url, **kwargs}) + return self.get_response + + +@pytest.mark.asyncio +async def test_openrouter_image_generation_payload_and_response(tmp_path: Path) -> None: + ref = tmp_path / "ref.png" + ref.write_bytes(PNG_BYTES) + fake = FakeClient( + FakeResponse( + { + "choices": [ + { + "message": { + "content": "done", + "images": [{"image_url": {"url": PNG_DATA_URL}}], + } + } + ] + } + ) + ) + client = OpenRouterImageGenerationClient( + api_key="sk-or-test", + api_base="https://openrouter.ai/api/v1/", + extra_headers={"X-Test": "1"}, + 
client=fake, # type: ignore[arg-type] + ) + + response = await client.generate( + prompt="make this blue", + model="openai/gpt-5.4-image-2", + reference_images=[str(ref)], + aspect_ratio="16:9", + image_size="2K", + ) + + assert isinstance(response, GeneratedImageResponse) + assert response.images == [PNG_DATA_URL] + assert response.content == "done" + + call = fake.calls[0] + assert call["url"] == "https://openrouter.ai/api/v1/chat/completions" + assert call["headers"]["Authorization"] == "Bearer sk-or-test" + assert call["headers"]["X-Test"] == "1" + body = call["json"] + assert body["modalities"] == ["image", "text"] + assert body["image_config"] == {"aspect_ratio": "16:9", "image_size": "2K"} + assert body["messages"][0]["content"][0] == {"type": "text", "text": "make this blue"} + assert body["messages"][0]["content"][1]["image_url"]["url"].startswith("data:image/png;base64,") + + +@pytest.mark.asyncio +async def test_openrouter_image_generation_requires_images() -> None: + fake = FakeClient(FakeResponse({"choices": [{"message": {"content": "text only"}}]})) + client = OpenRouterImageGenerationClient(api_key="sk-or-test", client=fake) # type: ignore[arg-type] + + with pytest.raises(ImageGenerationError, match="returned no images"): + await client.generate(prompt="draw", model="model") + + +@pytest.mark.asyncio +async def test_openrouter_image_generation_requires_api_key() -> None: + client = OpenRouterImageGenerationClient(api_key=None) + + with pytest.raises(ImageGenerationError, match="API key"): + await client.generate(prompt="draw", model="model") + + +@pytest.mark.asyncio +async def test_aihubmix_image_generation_payload_and_response() -> None: + raw_b64 = PNG_DATA_URL.removeprefix("data:image/png;base64,") + fake = FakeClient(FakeResponse({"output": {"b64_json": [{"bytesBase64": raw_b64}]}})) + client = AIHubMixImageGenerationClient( + api_key="sk-ahm-test", + api_base="https://aihubmix.com/v1/", + extra_headers={"APP-Code": "nanobot"}, + 
extra_body={"quality": "low"}, + client=fake, # type: ignore[arg-type] + ) + + response = await client.generate( + prompt="draw a logo", + model="gpt-image-2-free", + aspect_ratio="16:9", + image_size="1K", + ) + + assert response.images == [PNG_DATA_URL] + call = fake.calls[0] + assert call["url"] == "https://aihubmix.com/v1/models/openai/gpt-image-2-free/predictions" + assert call["headers"]["Authorization"] == "Bearer sk-ahm-test" + assert call["headers"]["APP-Code"] == "nanobot" + assert call["json"] == { + "input": { + "prompt": "draw a logo", + "n": 1, + "size": "1536x1024", + "quality": "low", + } + } + + +@pytest.mark.asyncio +async def test_aihubmix_image_edit_payload_uses_reference_images(tmp_path: Path) -> None: + raw_b64 = PNG_DATA_URL.removeprefix("data:image/png;base64,") + fake = FakeClient(FakeResponse({"output": [{"b64_json": raw_b64}]})) + ref = tmp_path / "ref.png" + ref.write_bytes(PNG_BYTES) + client = AIHubMixImageGenerationClient( + api_key="sk-ahm-test", + client=fake, # type: ignore[arg-type] + ) + + response = await client.generate( + prompt="edit this", + model="gpt-image-2-free", + reference_images=[str(ref)], + aspect_ratio="1:1", + ) + + assert response.images == [PNG_DATA_URL] + call = fake.calls[0] + assert call["url"] == "https://aihubmix.com/v1/models/openai/gpt-image-2-free/predictions" + assert call["json"]["input"]["prompt"] == "edit this" + assert call["json"]["input"]["n"] == 1 + assert call["json"]["input"]["size"] == "1024x1024" + assert call["json"]["input"]["image"].startswith("data:image/png;base64,") + + +@pytest.mark.asyncio +async def test_aihubmix_image_generation_downloads_url_response() -> None: + fake = FakeClient(FakeResponse({"data": [{"url": "https://cdn.example/image.png"}]})) + fake.get_response = FakeResponse({}, content=PNG_BYTES) + client = AIHubMixImageGenerationClient( + api_key="sk-ahm-test", + client=fake, # type: ignore[arg-type] + ) + + response = await client.generate(prompt="draw", 
model="gpt-image-2-free") + + assert response.images[0].startswith("data:image/png;base64,") + assert fake.get_calls[0]["url"] == "https://cdn.example/image.png" diff --git a/tests/tools/test_image_generation_tool.py b/tests/tools/test_image_generation_tool.py new file mode 100644 index 000000000..2afdbdff2 --- /dev/null +++ b/tests/tools/test_image_generation_tool.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pytest + +from nanobot.agent.tools.image_generation import ImageGenerationTool +from nanobot.config.loader import set_config_path +from nanobot.config.schema import ImageGenerationToolConfig, ProviderConfig +from nanobot.providers.image_generation import GeneratedImageResponse + +PNG_BYTES = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01" + b"\x00\x00\x00\x01\x08\x04\x00\x00\x00\xb5\x1c\x0c\x02" + b"\x00\x00\x00\x0bIDATx\xdacd\xfc\xff\x1f\x00\x03\x03" + b"\x02\x00\xef\xbf\xa7\xdb\x00\x00\x00\x00IEND\xaeB`\x82" +) +PNG_DATA_URL = ( + "data:image/png;base64," + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+/p9sAAAAASUVORK5CYII=" +) + + +class FakeImageClient: + instances: list["FakeImageClient"] = [] + + def __init__(self, **kwargs: Any) -> None: + self.kwargs = kwargs + self.calls: list[dict[str, Any]] = [] + self.instances.append(self) + + async def generate(self, **kwargs: Any) -> GeneratedImageResponse: + self.calls.append(kwargs) + return GeneratedImageResponse(images=[PNG_DATA_URL], content="", raw={}) + + +@pytest.mark.asyncio +async def test_generate_image_tool_stores_artifact_and_source_images( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + set_config_path(tmp_path / "config.json") + FakeImageClient.instances = [] + monkeypatch.setattr( + "nanobot.agent.tools.image_generation.OpenRouterImageGenerationClient", + FakeImageClient, + ) + ref = tmp_path / "ref.png" + ref.write_bytes(PNG_BYTES) + tool = 
ImageGenerationTool( + workspace=tmp_path, + config=ImageGenerationToolConfig(enabled=True, max_images_per_turn=2), + provider_config=ProviderConfig(api_key="sk-or-test"), + ) + + result = await tool.execute( + prompt="make this blue", + reference_images=["ref.png"], + aspect_ratio="16:9", + image_size="2K", + count=2, + ) + + payload = json.loads(result) + artifacts = payload["artifacts"] + assert len(artifacts) == 2 + assert Path(artifacts[0]["path"]).is_file() + assert artifacts[0]["source_images"] == [str(ref.resolve())] + assert artifacts[0]["model"] == "openai/gpt-5.4-image-2" + + fake = FakeImageClient.instances[0] + assert fake.kwargs["api_key"] == "sk-or-test" + assert len(fake.calls) == 2 + assert fake.calls[0]["aspect_ratio"] == "16:9" + assert fake.calls[0]["image_size"] == "2K" + + +@pytest.mark.asyncio +async def test_generate_image_tool_reports_missing_key(tmp_path: Path) -> None: + tool = ImageGenerationTool( + workspace=tmp_path, + config=ImageGenerationToolConfig(enabled=True), + provider_config=ProviderConfig(), + ) + + result = await tool.execute(prompt="draw") + + assert result.startswith("Error: OpenRouter API key is not configured") + + +@pytest.mark.asyncio +async def test_generate_image_tool_selects_aihubmix_provider( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + set_config_path(tmp_path / "config.json") + FakeImageClient.instances = [] + monkeypatch.setattr( + "nanobot.agent.tools.image_generation.AIHubMixImageGenerationClient", + FakeImageClient, + ) + tool = ImageGenerationTool( + workspace=tmp_path, + config=ImageGenerationToolConfig( + enabled=True, + provider="aihubmix", + model="gpt-image-2-free", + ), + provider_configs={ + "openrouter": ProviderConfig(api_key="sk-or-test"), + "aihubmix": ProviderConfig(api_key="sk-ahm-test", extra_body={"quality": "low"}), + }, + ) + + result = await tool.execute(prompt="draw a poster", aspect_ratio="3:4") + + payload = json.loads(result) + assert len(payload["artifacts"]) == 1 
+ fake = FakeImageClient.instances[0] + assert fake.kwargs["api_key"] == "sk-ahm-test" + assert fake.kwargs["extra_body"] == {"quality": "low"} + assert fake.calls[0]["model"] == "gpt-image-2-free" + assert fake.calls[0]["aspect_ratio"] == "3:4" + + +@pytest.mark.asyncio +async def test_generate_image_tool_reports_missing_aihubmix_key(tmp_path: Path) -> None: + tool = ImageGenerationTool( + workspace=tmp_path, + config=ImageGenerationToolConfig(enabled=True, provider="aihubmix"), + provider_configs={"aihubmix": ProviderConfig()}, + ) + + result = await tool.execute(prompt="draw") + + assert result.startswith("Error: AIHubMix API key is not configured") + + +@pytest.mark.asyncio +async def test_generate_image_tool_rejects_reference_outside_workspace(tmp_path: Path) -> None: + set_config_path(tmp_path / "config.json") + outside = tmp_path.parent / "outside.png" + outside.write_bytes(PNG_BYTES) + tool = ImageGenerationTool( + workspace=tmp_path, + config=ImageGenerationToolConfig(enabled=True), + provider_config=ProviderConfig(api_key="sk-or-test"), + ) + + result = await tool.execute(prompt="edit", reference_images=[str(outside)]) + + assert "reference_images must be inside the workspace" in result diff --git a/tests/tools/test_message_tool.py b/tests/tools/test_message_tool.py index 915fb0c98..decb5ba08 100644 --- a/tests/tools/test_message_tool.py +++ b/tests/tools/test_message_tool.py @@ -55,6 +55,25 @@ async def test_message_tool_marks_channel_delivery_only_when_enabled() -> None: assert sent[1].metadata == {"_record_channel_delivery": True} +@pytest.mark.asyncio +async def test_message_tool_records_media_deliveries() -> None: + sent: list[OutboundMessage] = [] + + async def _send(msg: OutboundMessage) -> None: + sent.append(msg) + + tool = MessageTool(send_callback=_send) + + await tool.execute( + content="image", + channel="websocket", + chat_id="chat-1", + media=["/tmp/generated.png"], + ) + + assert sent[0].metadata == {"_record_channel_delivery": True} + + 
@pytest.mark.asyncio async def test_message_tool_inherits_metadata_for_same_target() -> None: sent: list[OutboundMessage] = [] diff --git a/tests/utils/test_artifacts.py b/tests/utils/test_artifacts.py new file mode 100644 index 000000000..128e394f5 --- /dev/null +++ b/tests/utils/test_artifacts.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +import json +from datetime import datetime, timezone +from pathlib import Path + +import pytest + +from nanobot.config.loader import set_config_path +from nanobot.utils.artifacts import ( + ArtifactError, + decode_image_data_url, + generated_image_paths_from_messages, + generated_image_tool_result, + store_generated_image_artifact, +) + +PNG_DATA_URL = ( + "data:image/png;base64," + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+/p9sAAAAASUVORK5CYII=" +) + + +def test_decode_image_data_url_validates_image_payload() -> None: + raw, mime = decode_image_data_url(PNG_DATA_URL) + + assert raw.startswith(b"\x89PNG") + assert mime == "image/png" + + with pytest.raises(ArtifactError): + decode_image_data_url("data:image/png;base64,not-base64") + + +def test_store_generated_image_artifact_writes_image_and_sidecar(tmp_path: Path) -> None: + set_config_path(tmp_path / "config.json") + created_at = datetime(2026, 5, 8, 12, 0, tzinfo=timezone.utc) + + artifact = store_generated_image_artifact( + PNG_DATA_URL, + prompt="draw a tiny pixel", + model="openai/gpt-5.4-image-2", + source_images=["/tmp/ref.png"], + save_dir="generated", + created_at=created_at, + ) + + image_path = Path(artifact["path"]) + assert image_path.is_file() + assert image_path.parent == tmp_path / "media" / "generated" / "2026-05-08" + assert artifact["id"].startswith("img_") + assert artifact["mime"] == "image/png" + + sidecar = image_path.with_suffix(".json") + metadata = json.loads(sidecar.read_text(encoding="utf-8")) + assert metadata["path"] == str(image_path) + assert metadata["source_images"] == ["/tmp/ref.png"] + + +def 
test_store_generated_image_artifact_rejects_unsafe_save_dir(tmp_path: Path) -> None: + set_config_path(tmp_path / "config.json") + + with pytest.raises(ArtifactError): + store_generated_image_artifact( + PNG_DATA_URL, + prompt="x", + model="m", + save_dir="../outside", + ) + + +def test_generated_image_paths_from_tool_results() -> None: + result = generated_image_tool_result( + [ + {"id": "img_1", "path": "/tmp/one.png"}, + {"id": "img_2", "path": "/tmp/two.png"}, + ] + ) + + assert generated_image_paths_from_messages( + [ + {"role": "tool", "name": "generate_image", "content": result}, + {"role": "tool", "name": "other", "content": result}, + ] + ) == ["/tmp/one.png", "/tmp/two.png"] diff --git a/tests/utils/test_image_generation_intent.py b/tests/utils/test_image_generation_intent.py new file mode 100644 index 000000000..d1c896723 --- /dev/null +++ b/tests/utils/test_image_generation_intent.py @@ -0,0 +1,25 @@ +from nanobot.utils.image_generation_intent import image_generation_prompt + + +def test_image_generation_prompt_ignores_plain_messages() -> None: + assert image_generation_prompt("hello", {}) == "hello" + + +def test_image_generation_prompt_uses_auto_aspect_instruction() -> None: + prompt = image_generation_prompt( + "Draw a poster", + {"image_generation": {"enabled": True, "aspect_ratio": None}}, + ) + + assert "Draw a poster" in prompt + assert "Use the generate_image tool" in prompt + assert "Choose the most suitable aspect_ratio yourself" in prompt + + +def test_image_generation_prompt_uses_selected_aspect_ratio() -> None: + prompt = image_generation_prompt( + "Draw a banner", + {"image_generation": {"enabled": True, "aspect_ratio": "16:9"}}, + ) + + assert "aspect_ratio='16:9'" in prompt diff --git a/webui/src/App.tsx b/webui/src/App.tsx index 9eca02688..6798247d3 100644 --- a/webui/src/App.tsx +++ b/webui/src/App.tsx @@ -17,7 +17,7 @@ import { saveSecret, } from "@/lib/bootstrap"; import { NanobotClient } from "@/lib/nanobot-client"; -import { 
ClientProvider } from "@/providers/ClientProvider"; +import { ClientProvider, useClient } from "@/providers/ClientProvider"; import type { ChatSummary } from "@/lib/types"; import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; @@ -34,6 +34,7 @@ type BootState = }; const SIDEBAR_STORAGE_KEY = "nanobot-webui.sidebar"; +const RESTART_STARTED_KEY = "nanobot-webui.restartStartedAt"; const SIDEBAR_WIDTH = 272; type ShellView = "chat" | "settings"; @@ -237,6 +238,7 @@ export default function App() { function Shell({ onModelNameChange, onLogout }: { onModelNameChange: (modelName: string | null) => void; onLogout: () => void }) { const { t, i18n } = useTranslation(); + const { client } = useClient(); const { theme, toggle } = useTheme(); const { sessions, loading, refresh, createChat, deleteChat } = useSessions(); const [activeKey, setActiveKey] = useState(null); @@ -249,6 +251,8 @@ function Shell({ onModelNameChange, onLogout }: { onModelNameChange: (modelName: label: string; } | null>(null); const lastSessionsLen = useRef(0); + const restartSawDisconnectRef = useRef(false); + const [restartToast, setRestartToast] = useState(null); useEffect(() => { try { @@ -326,6 +330,43 @@ function Shell({ onModelNameChange, onLogout }: { onModelNameChange: (modelName: setMobileSidebarOpen(false); }, []); + const onRestart = useCallback(() => { + const chatId = activeSession?.chatId ?? client.defaultChatId; + if (!chatId) return; + restartSawDisconnectRef.current = false; + try { + window.localStorage.setItem(RESTART_STARTED_KEY, String(Date.now())); + } catch { + // ignore storage errors + } + client.sendMessage(chatId, "/restart"); + }, [activeSession?.chatId, client]); + + useEffect(() => { + return client.onStatus((status) => { + let startedAt = 0; + try { + startedAt = Number(window.localStorage.getItem(RESTART_STARTED_KEY) ?? 
"0"); + } catch { + startedAt = 0; + } + if (!startedAt) return; + if (status !== "open") { + restartSawDisconnectRef.current = true; + return; + } + const elapsedMs = Date.now() - startedAt; + if (!restartSawDisconnectRef.current && elapsedMs < 1500) return; + try { + window.localStorage.removeItem(RESTART_STARTED_KEY); + } catch { + // ignore storage errors + } + setRestartToast(t("app.restart.completed", { seconds: (elapsedMs / 1000).toFixed(1) })); + window.setTimeout(() => setRestartToast(null), 3_500); + }); + }, [client, t]); + const onTurnEnd = useCallback(() => { void refresh(); }, [refresh]); @@ -414,6 +455,7 @@ function Shell({ onModelNameChange, onLogout }: { onModelNameChange: (modelName: onBackToChat={() => setView("chat")} onModelNameChange={onModelNameChange} onLogout={onLogout} + onRestart={onRestart} /> ) : ( setPendingDelete(null)} onConfirm={onConfirmDelete} /> + {restartToast ? ( +
+ {restartToast} +
+ ) : null} ); } diff --git a/webui/src/components/MessageBubble.tsx b/webui/src/components/MessageBubble.tsx index ced6ead1d..dd16f45ad 100644 --- a/webui/src/components/MessageBubble.tsx +++ b/webui/src/components/MessageBubble.tsx @@ -142,7 +142,9 @@ function MessageMedia({ align === "right" ? "justify-end" : "justify-start", )} > - {images.length > 0 ? : null} + {images.length > 0 ? ( + + ) : null} {nonImages.map((item, i) => ( ))} @@ -208,9 +210,11 @@ function MediaCell({ media }: { media: UIMediaAttachment }) { function UserImages({ images, align = "right", + size = "compact", }: { images: UIImage[]; align?: "left" | "right"; + size?: "compact" | "large"; }) { const { t } = useTranslation(); // Only real-URL images can open in the lightbox; historical-replay @@ -230,6 +234,7 @@ function UserImages({
@@ -237,6 +242,7 @@ function UserImages({ void; }) { const hasUrl = typeof image.url === "string" && image.url.length > 0; const tileClasses = cn( - "relative h-24 w-24 overflow-hidden rounded-[14px] border border-border/60 bg-muted/40", + "relative overflow-hidden border border-border/60 bg-muted/40", + size === "large" + ? "h-56 w-[min(100%,22rem)] rounded-[18px] sm:h-72 sm:w-[26rem]" + : "h-24 w-24 rounded-[14px]", "shadow-[0_6px_18px_-14px_rgba(0,0,0,0.45)]", ); @@ -296,7 +307,7 @@ function UserImageCell({ loading="lazy" decoding="async" draggable={false} - className="h-full w-full object-cover" + className={cn("h-full w-full", size === "large" ? "object-contain" : "object-cover")} /> ); diff --git a/webui/src/components/settings/SettingsView.tsx b/webui/src/components/settings/SettingsView.tsx index 0f3b5b77d..5586e9d08 100644 --- a/webui/src/components/settings/SettingsView.tsx +++ b/webui/src/components/settings/SettingsView.tsx @@ -16,12 +16,14 @@ interface SettingsViewProps { onBackToChat: () => void; onModelNameChange: (modelName: string | null) => void; onLogout?: () => void; + onRestart?: () => void; } export function SettingsView({ onBackToChat, onModelNameChange, onLogout, + onRestart, }: SettingsViewProps) { const { token } = useClient(); const [settings, setSettings] = useState(null); @@ -119,6 +121,7 @@ export function SettingsView({ saving={saving} onSave={save} onLogout={onLogout} + onRestart={onRestart} /> ) : null} @@ -134,6 +137,7 @@ function SettingsSection({ saving, onSave, onLogout, + onRestart, }: { form: { model: string; @@ -148,6 +152,7 @@ function SettingsSection({ saving: boolean; onSave: () => void; onLogout?: () => void; + onRestart?: () => void; }) { const { t } = useTranslation(); return ( @@ -200,6 +205,19 @@ function SettingsSection({ + {onRestart && ( +
+

{t("app.system.section")}

+ + + + + +
+ )} + {onLogout && (

{t("app.account.section")}

diff --git a/webui/src/components/thread/ThreadComposer.tsx b/webui/src/components/thread/ThreadComposer.tsx index ac994f89e..824a70938 100644 --- a/webui/src/components/thread/ThreadComposer.tsx +++ b/webui/src/components/thread/ThreadComposer.tsx @@ -10,6 +10,8 @@ import { Activity, ArrowUp, BookOpen, + Check, + ChevronDown, CircleHelp, History, ImageIcon, @@ -33,7 +35,7 @@ import { MAX_IMAGES_PER_MESSAGE, } from "@/hooks/useAttachedImages"; import { useClipboardAndDrop } from "@/hooks/useClipboardAndDrop"; -import type { SendImage } from "@/hooks/useNanobotStream"; +import type { SendImage, SendOptions } from "@/hooks/useNanobotStream"; import type { SlashCommand } from "@/lib/types"; import { cn } from "@/lib/utils"; @@ -48,13 +50,16 @@ function formatBytes(n: number): string { } interface ThreadComposerProps { - onSend: (content: string, images?: SendImage[]) => void; + onSend: (content: string, images?: SendImage[], options?: SendOptions) => void; disabled?: boolean; placeholder?: string; isStreaming?: boolean; modelLabel?: string | null; variant?: "thread" | "hero"; slashCommands?: SlashCommand[]; + imageMode?: boolean; + onImageModeChange?: (enabled: boolean) => void; + onStop?: () => void; } const COMMAND_ICONS: Record = { @@ -69,10 +74,28 @@ const COMMAND_ICONS: Record = { "undo-2": Undo2, }; +type ImageAspectRatio = "auto" | "1:1" | "3:4" | "9:16" | "4:3" | "16:9"; + +const IMAGE_ASPECT_RATIOS: ImageAspectRatio[] = ["auto", "1:1", "3:4", "9:16", "4:3", "16:9"]; + function slashCommandI18nKey(command: string): string { return command.replace(/^\//, "").replace(/-/g, "_"); } +function scrollNearestOverflowParent(target: EventTarget | null, deltaY: number) { + if (!(target instanceof Element) || deltaY === 0) return; + let el: HTMLElement | null = target.parentElement; + while (el) { + const style = window.getComputedStyle(el); + const canScroll = /(auto|scroll)/.test(style.overflowY) && el.scrollHeight > el.clientHeight; + if (canScroll) { + el.scrollTop 
+= deltaY; + return; + } + el = el.parentElement; + } +} + export function ThreadComposer({ onSend, disabled, @@ -81,19 +104,38 @@ export function ThreadComposer({ modelLabel = null, variant = "thread", slashCommands = [], + imageMode: controlledImageMode, + onImageModeChange, + onStop, }: ThreadComposerProps) { const { t } = useTranslation(); const [value, setValue] = useState(""); const [inlineError, setInlineError] = useState(null); const [slashMenuDismissed, setSlashMenuDismissed] = useState(false); const [selectedCommandIndex, setSelectedCommandIndex] = useState(0); + const [uncontrolledImageMode, setUncontrolledImageMode] = useState(false); + const [imageAspectRatio, setImageAspectRatio] = useState("auto"); + const [aspectMenuOpen, setAspectMenuOpen] = useState(false); const textareaRef = useRef(null); const fileInputRef = useRef(null); + const aspectControlRef = useRef(null); const chipRefs = useRef(new Map()); const isHero = variant === "hero"; + const imageMode = controlledImageMode ?? uncontrolledImageMode; + const setImageMode = useCallback( + (enabled: boolean) => { + if (controlledImageMode === undefined) { + setUncontrolledImageMode(enabled); + } + onImageModeChange?.(enabled); + }, + [controlledImageMode, onImageModeChange], + ); const resolvedPlaceholder = isStreaming ? t("thread.composer.placeholderStreaming") - : placeholder ?? t("thread.composer.placeholderThread"); + : imageMode + ? t("thread.composer.imageMode.placeholder") + : placeholder ?? 
t("thread.composer.placeholderThread"); const { images, enqueue, remove, clear, encoding, full } = useAttachedImages(); @@ -190,6 +232,38 @@ export function ThreadComposer({ } }, [filteredSlashCommands.length, selectedCommandIndex]); + useEffect(() => { + if (!aspectMenuOpen) return; + + const closeOnPointerDown = (event: PointerEvent) => { + const target = event.target; + if (target instanceof Node && aspectControlRef.current?.contains(target)) return; + setAspectMenuOpen(false); + }; + const closeOnKeyDown = (event: KeyboardEvent) => { + if (event.key === "Escape") { + setAspectMenuOpen(false); + textareaRef.current?.focus(); + } + }; + const closeOnScroll = () => setAspectMenuOpen(false); + const closeOnWheel = (event: WheelEvent) => { + setAspectMenuOpen(false); + scrollNearestOverflowParent(event.target, event.deltaY); + }; + + document.addEventListener("pointerdown", closeOnPointerDown, true); + document.addEventListener("keydown", closeOnKeyDown); + document.addEventListener("scroll", closeOnScroll, true); + document.addEventListener("wheel", closeOnWheel, { capture: true, passive: true }); + return () => { + document.removeEventListener("pointerdown", closeOnPointerDown, true); + document.removeEventListener("keydown", closeOnKeyDown); + document.removeEventListener("scroll", closeOnScroll, true); + document.removeEventListener("wheel", closeOnWheel, true); + }; + }, [aspectMenuOpen]); + const resizeTextarea = useCallback(() => { requestAnimationFrame(() => { const el = textareaRef.current; @@ -227,7 +301,15 @@ export function ThreadComposer({ preview: { url: img.dataUrl, name: img.file.name }, })) : undefined; - onSend(trimmed, payload); + const options: SendOptions | undefined = imageMode + ? { + imageGeneration: { + enabled: true, + aspect_ratio: imageAspectRatio === "auto" ? 
null : imageAspectRatio, + }, + } + : undefined; + onSend(trimmed, payload, options); setValue(""); setInlineError(null); // Bubble owns the data URL copy; safe to revoke every staged blob @@ -235,7 +317,7 @@ export function ThreadComposer({ clear(); setSlashMenuDismissed(false); resizeTextarea(); - }, [canSend, clear, onSend, readyImages, resizeTextarea, value]); + }, [canSend, clear, imageAspectRatio, imageMode, onSend, readyImages, resizeTextarea, value]); const onKeyDown = (e: ReactKeyboardEvent) => { if (showSlashMenu) { @@ -312,6 +394,7 @@ export function ThreadComposer({ ); const attachButtonDisabled = disabled || full; + const showStopButton = isStreaming && !!onStop; return (
+
+ + {imageMode ? ( + + ) : null} + {imageMode && aspectMenuOpen ? ( + { + setImageAspectRatio(ratio); + setAspectMenuOpen(false); + textareaRef.current?.focus(); + }} + /> + ) : null} +
{modelLabel ? ( + ); + })} +
+ ); +} + function SlashCommandPalette({ commands, selectedIndex, @@ -511,7 +706,7 @@ function SlashCommandPalette({ aria-label={t("thread.composer.slash.ariaLabel")} className={cn( "absolute bottom-full left-1/2 z-30 mb-2 max-h-[22rem] w-[calc(100%-0.5rem)] -translate-x-1/2 overflow-hidden rounded-[18px] border", - "border-border/65 bg-popover/98 p-1.5 text-popover-foreground shadow-[0_18px_55px_rgba(15,23,42,0.18)] backdrop-blur", + "border-border/65 bg-popover p-1.5 text-popover-foreground shadow-[0_18px_55px_rgba(15,23,42,0.18)]", "dark:border-white/10 dark:shadow-[0_22px_55px_rgba(0,0,0,0.45)]", isHero ? "max-w-[58rem]" : "max-w-[49.5rem]", )} diff --git a/webui/src/components/thread/ThreadShell.tsx b/webui/src/components/thread/ThreadShell.tsx index f15551ce5..464dd38cb 100644 --- a/webui/src/components/thread/ThreadShell.tsx +++ b/webui/src/components/thread/ThreadShell.tsx @@ -4,9 +4,12 @@ import { BookOpen, ChevronRight, Code2, + ImageIcon, LayoutGrid, Lightbulb, MoreHorizontal, + Palette, + Sparkles, } from "lucide-react"; import { useTranslation } from "react-i18next"; @@ -15,7 +18,7 @@ import { ThreadComposer } from "@/components/thread/ThreadComposer"; import { ThreadHeader } from "@/components/thread/ThreadHeader"; import { StreamErrorNotice } from "@/components/thread/StreamErrorNotice"; import { ThreadViewport } from "@/components/thread/ThreadViewport"; -import { useNanobotStream } from "@/hooks/useNanobotStream"; +import { useNanobotStream, type SendImage, type SendOptions } from "@/hooks/useNanobotStream"; import { useSessionHistory } from "@/hooks/useSessions"; import { listSlashCommands } from "@/lib/api"; import type { ChatSummary, SlashCommand, UIMessage } from "@/lib/types"; @@ -52,6 +55,21 @@ const QUICK_ACTION_KEYS = [ { key: "more", icon: MoreHorizontal, tone: "text-muted-foreground/65" }, ] as const; +const IMAGE_QUICK_ACTION_KEYS = [ + { key: "icon", icon: ImageIcon, tone: "text-[#4f9de8]" }, + { key: "sticker", icon: Sparkles, tone: 
"text-[#f25b8f]" }, + { key: "poster", icon: Palette, tone: "text-[#eba45d]" }, + { key: "product", icon: LayoutGrid, tone: "text-[#53c59d]" }, + { key: "portrait", icon: ImageIcon, tone: "text-[#a877e7]" }, + { key: "edit", icon: MoreHorizontal, tone: "text-muted-foreground/65" }, +] as const; + +interface PendingFirstMessage { + content: string; + images?: SendImage[]; + options?: SendOptions; +} + export function ThreadShell({ session, title, @@ -67,10 +85,11 @@ export function ThreadShell({ const chatId = session?.chatId ?? null; const historyKey = session?.key ?? null; const { messages: historical, loading, hasPendingToolCalls } = useSessionHistory(historyKey); - const { client, modelName, token } = useClient(); + const { modelName, token } = useClient(); const [booting, setBooting] = useState(false); const [slashCommands, setSlashCommands] = useState([]); - const pendingFirstRef = useRef(null); + const [heroImageMode, setHeroImageMode] = useState(false); + const pendingFirstRef = useRef(null); const messageCacheRef = useRef>(new Map()); const lastCachedChatIdRef = useRef(null); @@ -82,6 +101,7 @@ export function ThreadShell({ messages, isStreaming, send, + stop, setMessages, streamError, dismissStreamError, @@ -109,7 +129,11 @@ export function ThreadShell({ // When the user switches away and back, keep the local in-memory thread // state (including not-yet-persisted messages) instead of replacing it with // whatever the history endpoint currently knows about. - setMessages(cached && cached.length > 0 ? 
cached : historical); + setMessages((prev) => { + if (cached && cached.length > 0) return cached; + if (historical.length === 0 && prev.length > 0) return prev; + return historical; + }); // eslint-disable-next-line react-hooks/exhaustive-deps }, [loading, chatId, historical]); @@ -142,18 +166,9 @@ export function ThreadShell({ const pending = pendingFirstRef.current; if (!pending) return; pendingFirstRef.current = null; - client.sendMessage(chatId, pending); - setMessages((prev) => [ - ...prev, - { - id: crypto.randomUUID(), - role: "user", - content: pending, - createdAt: Date.now(), - }, - ]); + send(pending.content, pending.images, pending.options); setBooting(false); - }, [chatId, client, setMessages]); + }, [chatId, send]); useEffect(() => { let cancelled = false; @@ -171,10 +186,10 @@ export function ThreadShell({ }, [token]); const handleWelcomeSend = useCallback( - async (content: string) => { + async (content: string, images?: SendImage[], options?: SendOptions) => { if (booting) return; setBooting(true); - pendingFirstRef.current = content; + pendingFirstRef.current = { content, images, options }; const newId = await onCreateChat?.(); if (!newId) { pendingFirstRef.current = null; @@ -186,20 +201,27 @@ export function ThreadShell({ const handleQuickAction = useCallback( (prompt: string) => { + const options: SendOptions | undefined = heroImageMode + ? { imageGeneration: { enabled: true, aspect_ratio: null } } + : undefined; if (session) { - send(prompt); + send(prompt, undefined, options); return; } - void handleWelcomeSend(prompt); + void handleWelcomeSend(prompt, undefined, options); }, - [handleWelcomeSend, send, session], + [handleWelcomeSend, heroImageMode, send, session], ); + const quickActionItems = heroImageMode ? IMAGE_QUICK_ACTION_KEYS : QUICK_ACTION_KEYS; + const quickActionPrefix = heroImageMode + ? "thread.empty.imageQuickActions" + : "thread.empty.quickActions"; const quickActions = (
- {QUICK_ACTION_KEYS.map(({ key, icon: Icon, tone }) => { - const title = t(`thread.empty.quickActions.${key}.title`); - const prompt = t(`thread.empty.quickActions.${key}.prompt`); + {quickActionItems.map(({ key, icon: Icon, tone }) => { + const title = t(`${quickActionPrefix}.${key}.title`); + const prompt = t(`${quickActionPrefix}.${key}.prompt`); return ( + +
, + ); + + const aspectButton = screen.getByRole("button", { name: "Image aspect ratio" }); + fireEvent.click(aspectButton); + expect(screen.getByRole("listbox", { name: "Image aspect ratio" })).toBeInTheDocument(); + + fireEvent.pointerDown(screen.getByRole("button", { name: "outside" })); + expect(screen.queryByRole("listbox", { name: "Image aspect ratio" })).not.toBeInTheDocument(); + + fireEvent.click(aspectButton); + fireEvent.keyDown(document, { key: "Escape" }); + expect(screen.queryByRole("listbox", { name: "Image aspect ratio" })).not.toBeInTheDocument(); + + fireEvent.click(aspectButton); + fireEvent.wheel(screen.getByRole("listbox", { name: "Image aspect ratio" }), { deltaY: 120 }); + expect(screen.queryByRole("listbox", { name: "Image aspect ratio" })).not.toBeInTheDocument(); + }); }); diff --git a/webui/src/tests/thread-shell.test.tsx b/webui/src/tests/thread-shell.test.tsx index 3dd47f6b8..f46cbc5ee 100644 --- a/webui/src/tests/thread-shell.test.tsx +++ b/webui/src/tests/thread-shell.test.tsx @@ -250,6 +250,64 @@ describe("ThreadShell", () => { expect(onNewChat).not.toHaveBeenCalled(); }); + it("keeps the first landing message when new chat history is still empty", async () => { + const client = makeClient(); + const onCreateChat = vi.fn().mockResolvedValue("chat-new"); + vi.stubGlobal( + "fetch", + vi.fn(async () => ({ + ok: false, + status: 404, + json: async () => ({}), + })), + ); + + const { rerender } = render( + wrap( + client, + {}} + onCreateChat={onCreateChat} + />, + ), + ); + + fireEvent.change(screen.getByLabelText("Message input"), { + target: { value: "first message should stay" }, + }); + fireEvent.click(screen.getByRole("button", { name: "Send message" })); + + await waitFor(() => expect(onCreateChat).toHaveBeenCalledTimes(1)); + + await act(async () => { + rerender( + wrap( + client, + {}} + onCreateChat={onCreateChat} + />, + ), + ); + }); + + await waitFor(() => + expect(client.sendMessage).toHaveBeenCalledWith( + "chat-new", + 
"first message should stay", + undefined, + ), + ); + await waitFor(() => + expect(screen.getByText("first message should stay")).toBeInTheDocument(), + ); + expect(screen.queryByText("What can I do for you?")).not.toBeInTheDocument(); + }); + it("sends quick action prompts from the empty thread landing", async () => { const client = makeClient(); const onNewChat = vi.fn().mockResolvedValue("chat-a"); @@ -566,6 +624,30 @@ describe("ThreadShell", () => { expect(screen.queryByRole("listbox", { name: "Slash commands" })).not.toBeInTheDocument(); }); + it("switches welcome quick actions when image mode is enabled", async () => { + const client = makeClient(); + render( + wrap( + client, + {}} + onNewChat={() => {}} + />, + ), + ); + await act(async () => {}); + + expect(screen.getByText("Write code")).toBeInTheDocument(); + expect(screen.queryByText("Design an app icon")).not.toBeInTheDocument(); + + fireEvent.click(screen.getByRole("button", { name: "Toggle image generation mode" })); + + expect(screen.getByText("Design an app icon")).toBeInTheDocument(); + expect(screen.queryByText("Write code")).not.toBeInTheDocument(); + }); + it("surfaces a dismissible banner when the stream reports message_too_big", async () => { const client = makeClient(); const onNewChat = vi.fn().mockResolvedValue("chat-a"); diff --git a/webui/src/tests/useNanobotStream.test.tsx b/webui/src/tests/useNanobotStream.test.tsx index 155ec118e..a9e92086f 100644 --- a/webui/src/tests/useNanobotStream.test.tsx +++ b/webui/src/tests/useNanobotStream.test.tsx @@ -134,6 +134,89 @@ describe("useNanobotStream", () => { ]); }); + it("suppresses redundant stream confirmation after assistant media", () => { + const fake = fakeClient(); + const { result } = renderHook(() => useNanobotStream("chat-img-result", EMPTY_MESSAGES), { + wrapper: wrap(fake.client), + }); + + act(() => { + fake.emit("chat-img-result", { + event: "message", + chat_id: "chat-img-result", + text: "image ready", + media_urls: [{ url: 
"/api/media/sig/image", name: "generated.png" }], + }); + fake.emit("chat-img-result", { + event: "message", + chat_id: "chat-img-result", + text: "message()", + kind: "tool_hint", + }); + fake.emit("chat-img-result", { + event: "delta", + chat_id: "chat-img-result", + text: "发送成功", + }); + fake.emit("chat-img-result", { + event: "stream_end", + chat_id: "chat-img-result", + }); + fake.emit("chat-img-result", { + event: "turn_end", + chat_id: "chat-img-result", + }); + }); + + expect(result.current.messages).toHaveLength(1); + expect(result.current.messages[0].content).toBe("image ready"); + expect(result.current.messages[0].media).toHaveLength(1); + }); + + it("passes image generation options to the websocket client", () => { + const fake = fakeClient(); + const { result } = renderHook(() => useNanobotStream("chat-img", EMPTY_MESSAGES), { + wrapper: wrap(fake.client), + }); + + act(() => { + result.current.send( + "draw a square icon", + undefined, + { imageGeneration: { enabled: true, aspect_ratio: "1:1" } }, + ); + }); + + expect(fake.client.sendMessage).toHaveBeenCalledWith( + "chat-img", + "draw a square icon", + undefined, + { imageGeneration: { enabled: true, aspect_ratio: "1:1" } }, + ); + }); + + it("stops the active turn without adding a user slash command bubble", () => { + const fake = fakeClient(); + const { result } = renderHook(() => useNanobotStream("chat-stop", EMPTY_MESSAGES), { + wrapper: wrap(fake.client), + }); + + act(() => { + result.current.send("long task"); + }); + expect(result.current.messages).toHaveLength(1); + expect(result.current.isStreaming).toBe(true); + + act(() => { + result.current.stop(); + }); + + expect(fake.client.sendMessage).toHaveBeenLastCalledWith("chat-stop", "/stop"); + expect(result.current.isStreaming).toBe(false); + expect(result.current.messages).toHaveLength(1); + expect(result.current.messages[0].content).toBe("long task"); + }); + it("keeps assistant buttons on complete messages", () => { const fake = 
fakeClient(); const { result } = renderHook(() => useNanobotStream("chat-q", EMPTY_MESSAGES), {