diff --git a/docs/image-generation.md b/docs/image-generation.md index 6ca7ed3fd..a9d6b620c 100644 --- a/docs/image-generation.md +++ b/docs/image-generation.md @@ -23,7 +23,7 @@ The feature is disabled by default. Enable it in `~/.nanobot/config.json`, confi } ``` -See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, and Gemini configuration examples. +See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, Gemini, Ollama, and StepFun configuration examples. > [!TIP] > Prefer environment variables for API keys. nanobot resolves `${VAR_NAME}` values from the environment at startup. @@ -46,7 +46,7 @@ The WebUI hides provider storage details from the user. The agent sees the saved | Option | Type | Default | Description | |--------|------|---------|-------------| | `tools.imageGeneration.enabled` | boolean | `false` | Register the `generate_image` tool | -| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `stepfun` | +| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, `stepfun` | | `tools.imageGeneration.model` | string | `"openai/gpt-5.4-image-2"` | Provider model name | | `tools.imageGeneration.defaultAspectRatio` | string | `"1:1"` | Default ratio when the prompt/tool call does not specify one | | `tools.imageGeneration.defaultImageSize` | string | `"1K"` | Default size hint, for example `1K`, `2K`, `4K`, or `1024x1024` | @@ -168,6 +168,31 @@ For reference-image edits, use a Gemini Flash image model: Imagen 4 supports the aspect ratios `1:1`, `9:16`, `16:9`, `3:4`, and `4:3`. Unsupported ratios are ignored and the model uses its default. The `defaultImageSize` setting has no effect on Gemini models; sizing is controlled by `defaultAspectRatio` only. Reference images passed with an Imagen model are ignored (with a warning logged). 
+### Ollama + +Ollama's experimental native image generation API works with local servers and hosted ollama.com models. Local access at `http://localhost:11434/api` does not require an API key; set `providers.ollama.apiKey` only when targeting `https://ollama.com/api`. + +```json +{ + "providers": { + "ollama": { + "apiBase": "http://localhost:11434/api" + } + }, + "tools": { + "imageGeneration": { + "enabled": true, + "provider": "ollama", + "model": "x/z-image-turbo", + "defaultAspectRatio": "16:9", + "defaultImageSize": "2K" + } + } +} +``` + +Ollama maps `defaultAspectRatio` and `defaultImageSize` to native `width` and `height` values. Reference images are not supported by this integration. + ### StepFun StepFun (阶跃星辰) `step-image-edit-2` supports text-to-image generation. The `step-1x-medium` variant additionally supports **style-reference** image edits, where a reference image guides the visual style of the output. @@ -274,7 +299,7 @@ Use the reference image. Keep the same robot and composition, change the palette |---------|-------| | `generate_image` is not available | Set `tools.imageGeneration.enabled` to `true` and restart the gateway | | Missing API key error | Configure `providers.<provider>.apiKey`; if using `${VAR_NAME}`, confirm the environment variable is visible to the gateway process | -| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, or `stepfun` | +| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, or `stepfun` | | AIHubMix says `Incorrect model ID` | Use `model: "gpt-image-2-free"`; nanobot expands it to the required `openai/gpt-image-2-free` model path internally | | Generation times out | Try a smaller/default image size, set AIHubMix `extraBody.quality` to `"low"`, or retry later | | Reference image rejected | Reference image paths must be inside the workspace or nanobot media directory and must be valid image files | diff --git 
a/nanobot/agent/tools/image_generation.py b/nanobot/agent/tools/image_generation.py index f2f599ded..58eaaf7d8 100644 --- a/nanobot/agent/tools/image_generation.py +++ b/nanobot/agent/tools/image_generation.py @@ -21,6 +21,7 @@ from nanobot.providers.image_generation import ( ImageGenerationProvider, get_image_gen_provider, ) +from nanobot.providers.registry import find_by_name from nanobot.utils.artifacts import ( ArtifactError, generated_image_tool_result, @@ -117,6 +118,10 @@ class ImageGenerationTool(Tool): def _provider_config(self) -> ProviderConfig | None: return self.provider_configs.get(self.config.provider) + def _provider_allows_missing_api_key(self) -> bool: + spec = find_by_name(self.config.provider) + return bool(spec and (spec.is_local or spec.is_direct or spec.is_oauth)) + def _provider_client(self) -> ImageGenerationProvider | None: provider = self._provider_config() cls = get_image_gen_provider(self.config.provider) @@ -174,7 +179,7 @@ class ImageGenerationTool(Tool): if client is None: return f"Error: unsupported image generation provider '{self.config.provider}'" provider = self._provider_config() - if not provider or not provider.api_key: + if not self._provider_allows_missing_api_key() and (not provider or not provider.api_key): return self._missing_api_key_error() requested = count or 1 diff --git a/nanobot/providers/image_generation.py b/nanobot/providers/image_generation.py index 501b98fd2..3ea8c374a 100644 --- a/nanobot/providers/image_generation.py +++ b/nanobot/providers/image_generation.py @@ -4,6 +4,7 @@ from __future__ import annotations import base64 import binascii +import re from abc import ABC, abstractmethod from dataclasses import dataclass from pathlib import Path @@ -31,6 +32,14 @@ _AIHUBMIX_ASPECT_RATIO_SIZES = { } _GEMINI_DEFAULT_TIMEOUT_S = 120.0 _GEMINI_IMAGEN_ASPECT_RATIOS = {"1:1", "9:16", "16:9", "3:4", "4:3"} +_OLLAMA_DEFAULT_SIDE = 1024 +_OLLAMA_SIZE_PRESETS = { + "1K": 1024, + "2K": 2048, + "4K": 4096, +} 
+_OLLAMA_EXPLICIT_SIZE_RE = re.compile(r"^\s*(\d+)\s*[xX]\s*(\d+)\s*$") +_OLLAMA_ASPECT_RATIO_RE = re.compile(r"^\s*(\d+)\s*:\s*(\d+)\s*$") class ImageGenerationError(RuntimeError): @@ -429,6 +438,139 @@ def _http_error_detail(response: httpx.Response) -> str: return response.text[:500] or "" +def _round_to_multiple(value: float, multiple: int = 8) -> int: + rounded = int(round(value / multiple) * multiple) + return max(multiple, rounded) + + +def _ollama_dimensions(aspect_ratio: str | None, image_size: str | None) -> tuple[int, int]: + if image_size: + size = image_size.strip() + explicit = _OLLAMA_EXPLICIT_SIZE_RE.fullmatch(size) + if explicit: + return int(explicit.group(1)), int(explicit.group(2)) + long_side = _OLLAMA_SIZE_PRESETS.get(size.upper(), _OLLAMA_DEFAULT_SIDE) + else: + long_side = _OLLAMA_DEFAULT_SIDE + + if not aspect_ratio: + return long_side, long_side + + ratio = _OLLAMA_ASPECT_RATIO_RE.fullmatch(aspect_ratio.strip()) + if ratio is None: + return long_side, long_side + + width_ratio = int(ratio.group(1)) + height_ratio = int(ratio.group(2)) + if width_ratio <= 0 or height_ratio <= 0: + return long_side, long_side + + if width_ratio >= height_ratio: + width = long_side + height = _round_to_multiple(long_side * height_ratio / width_ratio) + else: + height = long_side + width = _round_to_multiple(long_side * width_ratio / height_ratio) + return max(8, width), max(8, height) + + +def _ollama_image_data_url(value: str) -> str: + if value.startswith("data:image/"): + return value + return _b64_image_data_url(value) + + +def _ollama_images_from_payload(payload: dict[str, Any]) -> list[str]: + images: list[str] = [] + + def collect(value: Any) -> None: + if isinstance(value, str) and value: + images.append(_ollama_image_data_url(value)) + elif isinstance(value, list): + for item in value: + collect(item) + + collect(payload.get("image")) + collect(payload.get("images")) + return images + + +class OllamaImageGenerationClient(ImageGenerationProvider): + 
"""Async client for Ollama native image generation models.""" + + provider_name = "ollama" + default_timeout = 300.0 + + def _default_base_url(self) -> str: + return "http://localhost:11434/api" + + def _resolve_base_url(self, api_base: str | None) -> str: + if api_base: + base = api_base.rstrip("/") + if base.endswith("/v1"): + return f"{base[:-3]}/api" + return base + return self._default_base_url() + + async def generate( + self, + *, + prompt: str, + model: str, + reference_images: list[str] | None = None, + aspect_ratio: str | None = None, + image_size: str | None = None, + ) -> GeneratedImageResponse: + if reference_images: + raise ImageGenerationError( + "Ollama image generation does not support reference images" + ) + + width, height = _ollama_dimensions(aspect_ratio, image_size) + body: dict[str, Any] = { + "model": model, + "prompt": prompt, + "width": width, + "height": height, + "steps": 0, + } + body.update(self.extra_body) + body["stream"] = False + + headers = { + "Content-Type": "application/json", + **self.extra_headers, + } + if self.api_key: + headers["Authorization"] = f"Bearer {self.api_key}" + + url = f"{self.api_base}/generate" + response = await self._http_post(url, headers=headers, body=body) + + try: + response.raise_for_status() + except httpx.HTTPStatusError as exc: + detail = _http_error_detail(response) + logger.error( + "Ollama image generation failed (HTTP {}): {}", + response.status_code, + detail, + ) + raise ImageGenerationError( + f"Ollama image generation failed (HTTP {response.status_code}): {detail}" + ) from exc + + data = response.json() + images = _ollama_images_from_payload(data) + + self._require_images(images, data) + + response_text = data.get("response") + content = response_text if isinstance(response_text, str) else "" + + return GeneratedImageResponse(images=images, content=content, raw=data) + + class GeminiImageGenerationClient(ImageGenerationProvider): """Async client for Gemini/Imagen image generation via the 
Generative Language API.""" @@ -886,5 +1028,6 @@ def _stepfun_images_from_payload(payload: dict[str, Any]) -> list[str]: register_image_gen_provider(OpenRouterImageGenerationClient) register_image_gen_provider(AIHubMixImageGenerationClient) register_image_gen_provider(GeminiImageGenerationClient) +register_image_gen_provider(OllamaImageGenerationClient) register_image_gen_provider(MiniMaxImageGenerationClient) register_image_gen_provider(StepFunImageGenerationClient) diff --git a/tests/providers/test_image_generation.py b/tests/providers/test_image_generation.py index 3bee376d8..701f09f0a 100644 --- a/tests/providers/test_image_generation.py +++ b/tests/providers/test_image_generation.py @@ -13,6 +13,7 @@ from nanobot.providers.image_generation import ( GeneratedImageResponse, ImageGenerationError, MiniMaxImageGenerationClient, + OllamaImageGenerationClient, OpenRouterImageGenerationClient, StepFunImageGenerationClient, ) @@ -133,6 +134,54 @@ async def test_openrouter_image_generation_requires_api_key() -> None: await client.generate(prompt="draw", model="model") +@pytest.mark.asyncio +async def test_ollama_image_generation_payload_and_response() -> None: + raw_b64 = PNG_DATA_URL.removeprefix("data:image/png;base64,") + fake = FakeClient(FakeResponse({"image": raw_b64})) + client = OllamaImageGenerationClient( + api_key="ollama-test", + api_base="http://localhost:11434/v1/", + extra_headers={"X-Test": "1"}, + extra_body={"seed": 123}, + client=fake, # type: ignore[arg-type] + ) + + response = await client.generate( + prompt="a sunset", + model="x/z-image-turbo", + aspect_ratio="16:9", + image_size="1K", + ) + + assert response.images == [PNG_DATA_URL] + assert response.content == "" + + call = fake.calls[0] + assert call["url"] == "http://localhost:11434/api/generate" + assert call["headers"]["Authorization"] == "Bearer ollama-test" + assert call["headers"]["X-Test"] == "1" + body = call["json"] + assert body["model"] == "x/z-image-turbo" + assert body["prompt"] == 
"a sunset" + assert body["width"] == 1024 + assert body["height"] == 576 + assert body["steps"] == 0 + assert body["stream"] is False + assert body["seed"] == 123 + + +@pytest.mark.asyncio +async def test_ollama_image_generation_rejects_reference_images() -> None: + client = OllamaImageGenerationClient(api_key=None) + + with pytest.raises(ImageGenerationError, match="reference images"): + await client.generate( + prompt="edit this", + model="x/z-image-turbo", + reference_images=["ref.png"], + ) + + @pytest.mark.asyncio async def test_aihubmix_image_generation_payload_and_response() -> None: raw_b64 = PNG_DATA_URL.removeprefix("data:image/png;base64,") diff --git a/tests/tools/test_image_generation_tool.py b/tests/tools/test_image_generation_tool.py index 92ed8a339..f5d2d9183 100644 --- a/tests/tools/test_image_generation_tool.py +++ b/tests/tools/test_image_generation_tool.py @@ -138,6 +138,39 @@ async def test_generate_image_tool_reports_missing_aihubmix_key(tmp_path: Path) assert result.startswith("Error: AIHubMix API key is not configured") +@pytest.mark.asyncio +async def test_generate_image_tool_allows_ollama_without_api_key( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + set_config_path(tmp_path / "config.json") + FakeImageClient.instances = [] + monkeypatch.setattr( + "nanobot.agent.tools.image_generation.get_image_gen_provider", + lambda name: FakeImageClient if name == "ollama" else None, + ) + tool = ImageGenerationTool( + workspace=tmp_path, + config=ImageGenerationToolConfig( + enabled=True, + provider="ollama", + model="x/z-image-turbo", + ), + provider_configs={"ollama": ProviderConfig(api_base="http://localhost:11434/v1")}, + ) + + result = await tool.execute(prompt="draw a cat") + + payload = json.loads(result) + assert len(payload["artifacts"]) == 1 + + fake = FakeImageClient.instances[0] + assert fake.kwargs["api_key"] is None + assert fake.kwargs["api_base"] == "http://localhost:11434/v1" + assert fake.calls[0]["aspect_ratio"] 
== "1:1" + assert fake.calls[0]["image_size"] == "1K" + + @pytest.mark.asyncio async def test_generate_image_tool_rejects_reference_outside_workspace(tmp_path: Path) -> None: set_config_path(tmp_path / "config.json")