Merge PR #3946: Add Ollama image generation support

Add Ollama image generation support
2026-05-24 10:32:45 +00:00 · 2026-05-22 22:06:28 +08:00 · 2026-05-22 22:06:28 +08:00 · ddfe5c3bdf
commit ddfe5c3bdf
parent 8c0b2c1a29 f5534bcaa0
4 changed files with 253 additions and 3 deletions
--- a/docs/image-generation.md
+++ b/docs/image-generation.md
@ -23,7 +23,7 @@ The feature is disabled by default. Enable it in `~/.nanobot/config.json`, confi
 }
 ```
-See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, and Gemini configuration examples.
+See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, Gemini, Ollama, and StepFun configuration examples.
 > [!TIP]
 > Prefer environment variables for API keys. nanobot resolves `${VAR_NAME}` values from the environment at startup.
@ -46,7 +46,7 @@ The WebUI hides provider storage details from the user. The agent sees the saved
 | Option | Type | Default | Description |
 |--------|------|---------|-------------|
 | `tools.imageGeneration.enabled` | boolean | `false` | Register the `generate_image` tool |
-| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `stepfun` |
+| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, `stepfun` |
 | `tools.imageGeneration.model` | string | `"openai/gpt-5.4-image-2"` | Provider model name |
 | `tools.imageGeneration.defaultAspectRatio` | string | `"1:1"` | Default ratio when the prompt/tool call does not specify one |
 | `tools.imageGeneration.defaultImageSize` | string | `"1K"` | Default size hint, for example `1K`, `2K`, `4K`, or `1024x1024` |
@ -168,6 +168,31 @@ For reference-image edits, use a Gemini Flash image model:
 Imagen 4 supports the aspect ratios `1:1`, `9:16`, `16:9`, `3:4`, and `4:3`. Unsupported ratios are ignored and the model uses its default. The `defaultImageSize` setting has no effect on Gemini models; sizing is controlled by `defaultAspectRatio` only. Reference images passed with an Imagen model are ignored (with a warning logged).
 ### Ollama
 Ollama's experimental native image generation API works with local servers and hosted ollama.com models. Local access at `http://localhost:11434/api` does not require an API key; set `providers.ollama.apiKey` only when targeting `https://ollama.com/api`. If `apiBase` ends in `/v1`, nanobot rewrites that suffix to `/api` before calling Ollama.
 ```json
 {
  "providers": {
    "ollama": {
      "apiBase": "http://localhost:11434/api"
    }
  },
  "tools": {
    "imageGeneration": {
      "enabled": true,
      "provider": "ollama",
      "model": "x/z-image-turbo",
      "defaultAspectRatio": "16:9",
      "defaultImageSize": "2K"
    }
  }
 }
 ```
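 Ollama maps `defaultAspectRatio` and `defaultImageSize` to native `width` and `height` values. Reference images are not supported by this integration: a request that includes one fails with `Ollama image generation does not support reference images` instead of silently ignoring it.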
 ### StepFun
 StepFun (阶跃星辰) `step-image-edit-2` supports text-to-image generation.  The `step-1x-medium` variant additionally supports **style-reference** image edits, where a reference image guides the visual style of the output.
@ -274,7 +299,7 @@ Use the reference image. Keep the same robot and composition, change the palette
 |---------|-------|
 | `generate_image` is not available | Set `tools.imageGeneration.enabled` to `true` and restart the gateway |
 | Missing API key error | Configure `providers.<provider>.apiKey`; if using `${VAR_NAME}`, confirm the environment variable is visible to the gateway process |
-| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, or `stepfun` |
+| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, or `stepfun` |
 | AIHubMix says `Incorrect model ID` | Use `model: "gpt-image-2-free"`; nanobot expands it to the required `openai/gpt-image-2-free` model path internally |
 | Generation times out | Try a smaller/default image size, set AIHubMix `extraBody.quality` to `"low"`, or retry later |
 | Reference image rejected | Reference image paths must be inside the workspace or nanobot media directory and must be valid image files |
--- a/nanobot/providers/image_generation.py
+++ b/nanobot/providers/image_generation.py
@ -5,6 +5,7 @@ from __future__ import annotations
 import asyncio
 import base64
 import binascii
 import re
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
@ -32,6 +33,14 @@ _AIHUBMIX_ASPECT_RATIO_SIZES = {
 }
 _GEMINI_DEFAULT_TIMEOUT_S = 120.0
 _GEMINI_IMAGEN_ASPECT_RATIOS = {"1:1", "9:16", "16:9", "3:4", "4:3"}
 # Long-side length in pixels used when no size hint is given or the hint is not recognised.
 _OLLAMA_DEFAULT_SIDE = 1024
 # Size-hint labels (looked up upper-cased) mapped to the long-side length in pixels.
 _OLLAMA_SIZE_PRESETS = {
    "1K": 1024,
    "2K": 2048,
    "4K": 4096,
 }
 # Matches an explicit "<width>x<height>" size such as "1024x768"; groups are (width, height).
 _OLLAMA_EXPLICIT_SIZE_RE = re.compile(r"^\s*(\d+)\s*[xX]\s*(\d+)\s*$")
 # Matches a "<w>:<h>" aspect ratio such as "16:9"; groups are (w, h).
 _OLLAMA_ASPECT_RATIO_RE = re.compile(r"^\s*(\d+)\s*:\s*(\d+)\s*$")
 class ImageGenerationError(RuntimeError):
@ -430,6 +439,139 @@ def _http_error_detail(response: httpx.Response) -> str:
    return response.text[:500] or "<empty response body>"
 def _round_to_multiple(value: float, multiple: int = 8) -> int:
    rounded = int(round(value / multiple) * multiple)
    return max(multiple, rounded)
 def _ollama_dimensions(aspect_ratio: str | None, image_size: str | None) -> tuple[int, int]:
    """Translate aspect-ratio and size hints into pixel ``(width, height)``.

    Resolution order:

    1. An explicit ``"<width>x<height>"`` ``image_size`` with two positive
       sides is returned unchanged; ``aspect_ratio`` is ignored.
    2. Otherwise ``image_size`` is looked up (case-insensitively) in
       ``_OLLAMA_SIZE_PRESETS`` to get the long side; unknown or missing
       hints use ``_OLLAMA_DEFAULT_SIDE``.
    3. A valid ``"<w>:<h>"`` ``aspect_ratio`` keeps the long side and scales
       the short side, rounded to a multiple of 8. A missing, malformed or
       zero-term ratio yields a square image.

    Args:
        aspect_ratio: Optional ratio hint such as ``"16:9"``.
        image_size: Optional preset label (``"1K"``) or explicit ``"1024x768"``.

    Returns:
        ``(width, height)`` in pixels.
    """
    long_side = _OLLAMA_DEFAULT_SIDE
    if image_size:
        size = image_size.strip()
        explicit = _OLLAMA_EXPLICIT_SIZE_RE.fullmatch(size)
        if explicit:
            explicit_width = int(explicit.group(1))
            explicit_height = int(explicit.group(2))
            # A zero side (e.g. "0x512") cannot describe an image. Treat it
            # like any other unrecognised hint instead of sending it to the
            # server, mirroring the zero-term guard on the ratio below.
            if explicit_width > 0 and explicit_height > 0:
                return explicit_width, explicit_height
        long_side = _OLLAMA_SIZE_PRESETS.get(size.upper(), _OLLAMA_DEFAULT_SIDE)

    if not aspect_ratio:
        return long_side, long_side

    ratio = _OLLAMA_ASPECT_RATIO_RE.fullmatch(aspect_ratio.strip())
    if ratio is None:
        return long_side, long_side

    width_ratio = int(ratio.group(1))
    height_ratio = int(ratio.group(2))
    if width_ratio <= 0 or height_ratio <= 0:
        return long_side, long_side

    # The larger ratio term gets the long side; the other side is scaled down.
    if width_ratio >= height_ratio:
        width = long_side
        height = _round_to_multiple(long_side * height_ratio / width_ratio)
    else:
        height = long_side
        width = _round_to_multiple(long_side * width_ratio / height_ratio)
    return max(8, width), max(8, height)
 def _ollama_image_data_url(value: str) -> str:
    if value.startswith("data:image/"):
        return value
    return _b64_image_data_url(value)
 def _ollama_images_from_payload(payload: dict[str, Any]) -> list[str]:
    images: list[str] = []
    def collect(value: Any) -> None:
        if isinstance(value, str) and value:
            images.append(_ollama_image_data_url(value))
        elif isinstance(value, list):
            for item in value:
                collect(item)
    collect(payload.get("image"))
    collect(payload.get("images"))
    return images
 class OllamaImageGenerationClient(ImageGenerationProvider):
    """Async client for Ollama native image generation models.

    Issues one non-streaming ``POST {api_base}/generate`` request per call and
    converts the images in the JSON reply into data URLs. An API key is
    optional: the ``Authorization`` header is only sent when one is set.
    """

    provider_name = "ollama"
    default_timeout = 300.0

    def _default_base_url(self) -> str:
        """Return the native API root of a default local Ollama server."""
        return "http://localhost:11434/api"

    def _resolve_base_url(self, api_base: str | None) -> str:
        """Normalise a configured base URL to Ollama's native API root.

        Trailing slashes are dropped and a trailing ``/v1`` segment is
        replaced with ``/api``. With no ``api_base`` the local default is used.
        """
        if not api_base:
            return self._default_base_url()
        base = api_base.rstrip("/")
        if base.endswith("/v1"):
            return f"{base[:-3]}/api"
        return base

    async def generate(
        self,
        *,
        prompt: str,
        model: str,
        reference_images: list[str] | None = None,
        aspect_ratio: str | None = None,
        image_size: str | None = None,
    ) -> GeneratedImageResponse:
        """Generate an image from ``prompt`` with a single non-streaming request.

        Args:
            prompt: Text sent as the request's ``prompt`` field.
            model: Model name sent as the request's ``model`` field.
            reference_images: Must be empty or ``None``; this client rejects them.
            aspect_ratio: Optional ratio hint, converted to pixels by
                ``_ollama_dimensions``.
            image_size: Optional size hint, converted to pixels by
                ``_ollama_dimensions``.

        Returns:
            A ``GeneratedImageResponse`` holding the images found in the
            payload, the payload's ``response`` string (or ``""`` when it is
            absent or not a string) as ``content``, and the decoded payload
            as ``raw``.

        Raises:
            ImageGenerationError: If reference images are supplied, the server
                answers with an HTTP error status, or the response body is
                not a JSON object.
        """
        if reference_images:
            raise ImageGenerationError(
                "Ollama image generation does not support reference images"
            )

        width, height = _ollama_dimensions(aspect_ratio, image_size)
        body: dict[str, Any] = {
            "model": model,
            "prompt": prompt,
            "width": width,
            "height": height,
            # NOTE(review): presumably 0 lets the server pick the model's
            # default step count — confirm against the Ollama API docs.
            "steps": 0,
        }
        body.update(self.extra_body)
        # Forced after merging extra_body so a user override cannot enable
        # streaming; the reply is parsed below as one JSON document.
        body["stream"] = False

        headers = {
            "Content-Type": "application/json",
            **self.extra_headers,
        }
        # Set last so the configured key wins over any extra header.
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        url = f"{self.api_base}/generate"
        response = await self._http_post(url, headers=headers, body=body)
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError as exc:
            detail = _http_error_detail(response)
            logger.error(
                "Ollama image generation failed (HTTP {}): {}",
                response.status_code,
                detail,
            )
            raise ImageGenerationError(
                f"Ollama image generation failed (HTTP {response.status_code}): {detail}"
            ) from exc

        # A success status does not guarantee a JSON object body (e.g. a proxy
        # error page). Surface that as ImageGenerationError so callers that
        # handle provider failures do not see a raw ValueError/AttributeError.
        try:
            data = response.json()
        except ValueError as exc:
            raise ImageGenerationError(
                "Ollama image generation returned a non-JSON response: "
                f"{_http_error_detail(response)}"
            ) from exc
        if not isinstance(data, dict):
            raise ImageGenerationError(
                "Ollama image generation returned an unexpected JSON payload "
                f"of type {type(data).__name__}"
            )

        images = _ollama_images_from_payload(data)
        # NOTE(review): _require_images is inherited; presumably it raises
        # when no image was found — confirm on the base class.
        self._require_images(images, data)
        response_text = data.get("response")
        content = response_text if isinstance(response_text, str) else ""
        return GeneratedImageResponse(images=images, content=content, raw=data)
 class GeminiImageGenerationClient(ImageGenerationProvider):
    """Async client for Gemini/Imagen image generation via the Generative Language API."""
@ -1307,6 +1449,7 @@ def _stepfun_images_from_payload(payload: dict[str, Any]) -> list[str]:
 register_image_gen_provider(AIHubMixImageGenerationClient)
 register_image_gen_provider(CodexImageGenerationClient)
 register_image_gen_provider(GeminiImageGenerationClient)
 register_image_gen_provider(OllamaImageGenerationClient)
 register_image_gen_provider(MiniMaxImageGenerationClient)
 register_image_gen_provider(OpenAIImageGenerationClient)
 register_image_gen_provider(OpenRouterImageGenerationClient)
--- a/tests/providers/test_image_generation.py
+++ b/tests/providers/test_image_generation.py
@ -14,6 +14,7 @@ from nanobot.providers.image_generation import (
    GeneratedImageResponse,
    ImageGenerationError,
    MiniMaxImageGenerationClient,
    OllamaImageGenerationClient,
    OpenAIImageGenerationClient,
    OpenRouterImageGenerationClient,
    StepFunImageGenerationClient,
@ -146,6 +147,54 @@ async def test_openrouter_image_generation_requires_api_key() -> None:
        await client.generate(prompt="draw", model="model")
@pytest.mark.asyncio
 async def test_ollama_image_generation_payload_and_response() -> None:
    """The Ollama client posts the expected request and returns the image as a data URL."""
    encoded_png = PNG_DATA_URL.removeprefix("data:image/png;base64,")
    transport = FakeClient(FakeResponse({"image": encoded_png}))
    ollama = OllamaImageGenerationClient(
        api_key="ollama-test",
        api_base="http://localhost:11434/v1/",
        extra_headers={"X-Test": "1"},
        extra_body={"seed": 123},
        client=transport,  # type: ignore[arg-type]
    )

    result = await ollama.generate(
        prompt="a sunset",
        model="x/z-image-turbo",
        aspect_ratio="16:9",
        image_size="1K",
    )

    # The bare base64 string from the fake response comes back as a data URL.
    assert result.images == [PNG_DATA_URL]
    assert result.content == ""

    request = transport.calls[0]
    # The configured "/v1/" base is expected to resolve to the native "/api" root.
    assert request["url"] == "http://localhost:11434/api/generate"

    sent_headers = request["headers"]
    assert sent_headers["Authorization"] == "Bearer ollama-test"
    assert sent_headers["X-Test"] == "1"

    sent_body = request["json"]
    expected_fields = {
        "model": "x/z-image-turbo",
        "prompt": "a sunset",
        "width": 1024,
        "height": 576,
        "steps": 0,
        "seed": 123,
    }
    for field, expected in expected_fields.items():
        assert sent_body[field] == expected
    assert sent_body["stream"] is False
@pytest.mark.asyncio
 async def test_ollama_image_generation_rejects_reference_images() -> None:
    """Passing reference images to the Ollama client raises ImageGenerationError."""
    ollama = OllamaImageGenerationClient(api_key=None)
    request = {
        "prompt": "edit this",
        "model": "x/z-image-turbo",
        "reference_images": ["ref.png"],
    }

    with pytest.raises(ImageGenerationError, match="reference images"):
        await ollama.generate(**request)
@pytest.mark.asyncio
 async def test_aihubmix_image_generation_payload_and_response() -> None:
    raw_b64 = PNG_DATA_URL.removeprefix("data:image/png;base64,")
--- a/tests/tools/test_image_generation_tool.py
+++ b/tests/tools/test_image_generation_tool.py
@ -138,6 +138,39 @@ async def test_generate_image_tool_reports_missing_aihubmix_key(tmp_path: Path)
    assert result.startswith("Error: AIHubMix API key is not configured")
@pytest.mark.asyncio
 async def test_generate_image_tool_allows_ollama_without_api_key(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """The tool runs with the ``ollama`` provider even though no API key is configured."""
    set_config_path(tmp_path / "config.json")
    FakeImageClient.instances = []

    def lookup_provider(name):
        # Only the "ollama" provider resolves; every other name is unknown.
        if name == "ollama":
            return FakeImageClient
        return None

    monkeypatch.setattr(
        "nanobot.agent.tools.image_generation.get_image_gen_provider",
        lookup_provider,
    )
    tool_config = ImageGenerationToolConfig(
        enabled=True,
        provider="ollama",
        model="x/z-image-turbo",
    )
    ollama_settings = ProviderConfig(api_base="http://localhost:11434/v1")
    tool = ImageGenerationTool(
        workspace=tmp_path,
        config=tool_config,
        provider_configs={"ollama": ollama_settings},
    )

    raw_result = await tool.execute(prompt="draw a cat")

    artifacts = json.loads(raw_result)["artifacts"]
    assert len(artifacts) == 1

    created = FakeImageClient.instances[0]
    assert created.kwargs["api_key"] is None
    assert created.kwargs["api_base"] == "http://localhost:11434/v1"

    # With no overrides, the tool passes its configured defaults to the client.
    first_call = created.calls[0]
    assert first_call["aspect_ratio"] == "1:1"
    assert first_call["image_size"] == "1K"
@pytest.mark.asyncio
 async def test_generate_image_tool_rejects_reference_outside_workspace(tmp_path: Path) -> None:
    set_config_path(tmp_path / "config.json")