Add Ollama image generation support

This commit is contained in:
Haisam Abbas 2026-05-21 12:06:08 +05:00
parent eae51333ad
commit 84603f4cf2
5 changed files with 259 additions and 4 deletions

View File

@ -23,7 +23,7 @@ The feature is disabled by default. Enable it in `~/.nanobot/config.json`, confi
}
```
See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, and Gemini configuration examples.
See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, Gemini, Ollama, and StepFun configuration examples.
> [!TIP]
> Prefer environment variables for API keys. nanobot resolves `${VAR_NAME}` values from the environment at startup.
@ -46,7 +46,7 @@ The WebUI hides provider storage details from the user. The agent sees the saved
| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `tools.imageGeneration.enabled` | boolean | `false` | Register the `generate_image` tool |
| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `stepfun` |
| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, `stepfun` |
| `tools.imageGeneration.model` | string | `"openai/gpt-5.4-image-2"` | Provider model name |
| `tools.imageGeneration.defaultAspectRatio` | string | `"1:1"` | Default ratio when the prompt/tool call does not specify one |
| `tools.imageGeneration.defaultImageSize` | string | `"1K"` | Default size hint, for example `1K`, `2K`, `4K`, or `1024x1024` |
@ -168,6 +168,31 @@ For reference-image edits, use a Gemini Flash image model:
Imagen 4 supports the aspect ratios `1:1`, `9:16`, `16:9`, `3:4`, and `4:3`. Unsupported ratios are ignored and the model uses its default. The `defaultImageSize` setting has no effect on Gemini models; sizing is controlled by `defaultAspectRatio` only. Reference images passed with an Imagen model are ignored (with a warning logged).
### Ollama
Ollama's experimental native image generation API works with local servers and hosted ollama.com models. Local access at `http://localhost:11434/api` does not require an API key; set `providers.ollama.apiKey` only when targeting `https://ollama.com/api`.
```json
{
"providers": {
"ollama": {
"apiBase": "http://localhost:11434/api"
}
},
"tools": {
"imageGeneration": {
"enabled": true,
"provider": "ollama",
"model": "x/z-image-turbo",
"defaultAspectRatio": "16:9",
"defaultImageSize": "2K"
}
}
}
```
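Ollama maps `defaultAspectRatio` and `defaultImageSize` to native `width` and `height` values: the size preset (`1K` = 1024, `2K` = 2048, `4K` = 4096) sets the longer side, and the aspect ratio determines the shorter side, rounded to a multiple of 8. The example above therefore requests a 2048x1152 image. An explicit `WIDTHxHEIGHT` size such as `1024x768` is sent as-is and the aspect ratio is ignored. Reference images are not supported by this integration; a request that includes them fails with an error.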
### StepFun
StepFun (阶跃星辰) `step-image-edit-2` supports text-to-image generation. The `step-1x-medium` variant additionally supports **style-reference** image edits, where a reference image guides the visual style of the output.
@ -274,7 +299,7 @@ Use the reference image. Keep the same robot and composition, change the palette
|---------|-------|
| `generate_image` is not available | Set `tools.imageGeneration.enabled` to `true` and restart the gateway |
| Missing API key error | Configure `providers.<provider>.apiKey`; if using `${VAR_NAME}`, confirm the environment variable is visible to the gateway process |
| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, or `stepfun` |
| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, or `stepfun` |
| AIHubMix says `Incorrect model ID` | Use `model: "gpt-image-2-free"`; nanobot expands it to the required `openai/gpt-image-2-free` model path internally |
| Generation times out | Try a smaller/default image size, set AIHubMix `extraBody.quality` to `"low"`, or retry later |
| Reference image rejected | Reference image paths must be inside the workspace or nanobot media directory and must be valid image files |

View File

@ -21,6 +21,7 @@ from nanobot.providers.image_generation import (
ImageGenerationProvider,
get_image_gen_provider,
)
from nanobot.providers.registry import find_by_name
from nanobot.utils.artifacts import (
ArtifactError,
generated_image_tool_result,
@ -117,6 +118,10 @@ class ImageGenerationTool(Tool):
def _provider_config(self) -> ProviderConfig | None:
    """Return the stored settings for the configured image provider, or ``None`` if absent."""
    provider_name = self.config.provider
    return self.provider_configs.get(provider_name)
def _provider_allows_missing_api_key(self) -> bool:
    """Tell whether the configured provider may be used without an API key.

    The provider name is looked up via ``find_by_name``; a key is optional
    when the resulting spec is flagged as local, direct, or OAuth-based.
    Providers for which the lookup returns nothing always require a key.
    """
    spec = find_by_name(self.config.provider)
    if not spec:
        return False
    keyless = spec.is_local or spec.is_direct or spec.is_oauth
    return bool(keyless)
def _provider_client(self) -> ImageGenerationProvider | None:
provider = self._provider_config()
cls = get_image_gen_provider(self.config.provider)
@ -174,7 +179,7 @@ class ImageGenerationTool(Tool):
if client is None:
return f"Error: unsupported image generation provider '{self.config.provider}'"
provider = self._provider_config()
if not provider or not provider.api_key:
if not self._provider_allows_missing_api_key() and (not provider or not provider.api_key):
return self._missing_api_key_error()
requested = count or 1

View File

@ -4,6 +4,7 @@ from __future__ import annotations
import base64
import binascii
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
@ -31,6 +32,14 @@ _AIHUBMIX_ASPECT_RATIO_SIZES = {
}
_GEMINI_DEFAULT_TIMEOUT_S = 120.0
_GEMINI_IMAGEN_ASPECT_RATIOS = {"1:1", "9:16", "16:9", "3:4", "4:3"}
# Long-side pixel length used when no size hint (or an unrecognised one) is given.
_OLLAMA_DEFAULT_SIDE = 1024
# Named size hints mapped to the long-side pixel length they stand for.
_OLLAMA_SIZE_PRESETS = {"1K": 1024, "2K": 2048, "4K": 4096}
# Explicit ``WIDTHxHEIGHT`` hint such as ``1024x768``; optional whitespace is tolerated.
_OLLAMA_EXPLICIT_SIZE_RE = re.compile(r"^\s*(\d+)\s*[xX]\s*(\d+)\s*$")
# ``W:H`` aspect ratio such as ``16:9``; optional whitespace is tolerated.
_OLLAMA_ASPECT_RATIO_RE = re.compile(r"^\s*(\d+)\s*:\s*(\d+)\s*$")
class ImageGenerationError(RuntimeError):
@ -429,6 +438,139 @@ def _http_error_detail(response: httpx.Response) -> str:
return response.text[:500] or "<empty response body>"
def _round_to_multiple(value: float, multiple: int = 8) -> int:
    """Snap ``value`` to the nearest multiple of ``multiple``.

    Ties are resolved by Python's built-in ``round`` (half to even), and the
    result is never smaller than one ``multiple``.
    """
    steps = round(value / multiple)
    snapped = int(steps * multiple)
    return snapped if snapped > multiple else multiple
def _ollama_dimensions(aspect_ratio: str | None, image_size: str | None) -> tuple[int, int]:
    """Translate an aspect-ratio / size hint pair into ``(width, height)`` pixels.

    * An explicit ``WIDTHxHEIGHT`` size is returned verbatim and the aspect
      ratio is ignored.
    * Otherwise a preset name (case-insensitive) selects the long side;
      unknown or missing hints fall back to the default side length.
    * A valid ``W:H`` ratio keeps the long side and derives the short side,
      rounded to a multiple of 8. A missing, malformed, or zero-valued ratio
      yields a square image.
    """
    long_side = _OLLAMA_DEFAULT_SIDE
    if image_size:
        hint = image_size.strip()
        explicit = _OLLAMA_EXPLICIT_SIZE_RE.fullmatch(hint)
        if explicit is not None:
            explicit_width, explicit_height = (int(part) for part in explicit.groups())
            return explicit_width, explicit_height
        long_side = _OLLAMA_SIZE_PRESETS.get(hint.upper(), _OLLAMA_DEFAULT_SIDE)

    square = (long_side, long_side)
    parsed = _OLLAMA_ASPECT_RATIO_RE.fullmatch(aspect_ratio.strip()) if aspect_ratio else None
    if parsed is None:
        return square

    ratio_w, ratio_h = int(parsed.group(1)), int(parsed.group(2))
    if ratio_w <= 0 or ratio_h <= 0:
        return square

    if ratio_w >= ratio_h:
        # Landscape or square: width is the long side.
        width, height = long_side, _round_to_multiple(long_side * ratio_h / ratio_w)
    else:
        # Portrait: height is the long side.
        width, height = _round_to_multiple(long_side * ratio_w / ratio_h), long_side
    return max(8, width), max(8, height)
def _ollama_image_data_url(value: str) -> str:
    """Return ``value`` as an image data URL.

    Strings that already start with ``data:image/`` pass through untouched;
    anything else is handed to ``_b64_image_data_url`` (presumably raw base64
    image data -- confirm against that helper).
    """
    already_data_url = value.startswith("data:image/")
    return value if already_data_url else _b64_image_data_url(value)
def _ollama_images_from_payload(payload: dict[str, Any]) -> list[str]:
    """Gather every image found under the ``image`` and ``images`` keys.

    Each key may hold a string or an arbitrarily nested list of strings.
    Non-empty strings are converted with ``_ollama_image_data_url``; empty
    strings and values of any other type are skipped. Results keep the order
    ``image`` first, then ``images``.
    """

    def walk(node: Any):
        # Depth-first, left-to-right traversal of strings and nested lists.
        if isinstance(node, str):
            if node:
                yield _ollama_image_data_url(node)
        elif isinstance(node, list):
            for child in node:
                yield from walk(child)

    found: list[str] = []
    for key in ("image", "images"):
        found.extend(walk(payload.get(key)))
    return found
class OllamaImageGenerationClient(ImageGenerationProvider):
    """Async client for Ollama native image generation models.

    Requests are POSTed to ``<api_base>/generate``. An API key is optional:
    the ``Authorization`` header is only sent when a key is configured.
    Reference images are not supported.
    """

    provider_name = "ollama"
    default_timeout = 300.0

    def _default_base_url(self) -> str:
        """Return the API root of a local Ollama server."""
        return "http://localhost:11434/api"

    def _resolve_base_url(self, api_base: str | None) -> str:
        """Normalise a configured base URL to Ollama's native API root.

        Trailing slashes are dropped, and a base ending in ``/v1`` has that
        suffix replaced by ``/api``. An empty or missing value falls back to
        the local default.
        """
        if not api_base:
            return self._default_base_url()
        base = api_base.rstrip("/")
        if base.endswith("/v1"):
            return f"{base.removesuffix('/v1')}/api"
        return base

    async def generate(
        self,
        *,
        prompt: str,
        model: str,
        reference_images: list[str] | None = None,
        aspect_ratio: str | None = None,
        image_size: str | None = None,
    ) -> GeneratedImageResponse:
        """Generate images for ``prompt`` with the given Ollama ``model``.

        Args:
            prompt: Text prompt sent to the model.
            model: Ollama model name.
            reference_images: Must be empty or ``None``; this client rejects them.
            aspect_ratio: Optional ``W:H`` hint converted to pixel dimensions.
            image_size: Optional preset or ``WIDTHxHEIGHT`` hint.

        Returns:
            The generated images as data URLs, the textual ``response`` field
            (empty string when absent or not a string), and the raw payload.

        Raises:
            ImageGenerationError: If reference images are supplied, the server
                answers with an HTTP error status, or the response body is not
                a JSON object.
        """
        if reference_images:
            raise ImageGenerationError(
                "Ollama image generation does not support reference images"
            )

        width, height = _ollama_dimensions(aspect_ratio, image_size)
        body: dict[str, Any] = {
            "model": model,
            "prompt": prompt,
            "width": width,
            "height": height,
            # NOTE(review): 0 presumably lets the server pick its default
            # step count -- confirm against the Ollama API.
            "steps": 0,
        }
        # Caller-supplied extras may override the defaults above, but the
        # request is always non-streaming so a single JSON document comes back.
        body.update(self.extra_body)
        body["stream"] = False

        headers = {
            "Content-Type": "application/json",
            **self.extra_headers,
        }
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        url = f"{self.api_base}/generate"
        response = await self._http_post(url, headers=headers, body=body)
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError as exc:
            detail = _http_error_detail(response)
            logger.error(
                "Ollama image generation failed (HTTP {}): {}",
                response.status_code,
                detail,
            )
            raise ImageGenerationError(
                f"Ollama image generation failed (HTTP {response.status_code}): {detail}"
            ) from exc

        # A successful status does not guarantee a JSON object (e.g. a proxy
        # error page). Surface that as ImageGenerationError instead of leaking
        # ValueError / AttributeError to the caller.
        try:
            data = response.json()
        except ValueError as exc:
            raise ImageGenerationError(
                "Ollama image generation returned a non-JSON response"
            ) from exc
        if not isinstance(data, dict):
            raise ImageGenerationError(
                "Ollama image generation returned an unexpected response payload"
            )

        images = _ollama_images_from_payload(data)
        self._require_images(images, data)
        response_text = data.get("response")
        content = response_text if isinstance(response_text, str) else ""
        return GeneratedImageResponse(images=images, content=content, raw=data)
class GeminiImageGenerationClient(ImageGenerationProvider):
"""Async client for Gemini/Imagen image generation via the Generative Language API."""
@ -886,5 +1028,6 @@ def _stepfun_images_from_payload(payload: dict[str, Any]) -> list[str]:
# Register the built-in image generation clients when this module is imported.
# NOTE(review): presumably this is what lets a provider name be resolved to a
# client class via get_image_gen_provider -- confirm against the registry.
register_image_gen_provider(OpenRouterImageGenerationClient)
register_image_gen_provider(AIHubMixImageGenerationClient)
register_image_gen_provider(GeminiImageGenerationClient)
register_image_gen_provider(OllamaImageGenerationClient)
register_image_gen_provider(MiniMaxImageGenerationClient)
register_image_gen_provider(StepFunImageGenerationClient)

View File

@ -13,6 +13,7 @@ from nanobot.providers.image_generation import (
GeneratedImageResponse,
ImageGenerationError,
MiniMaxImageGenerationClient,
OllamaImageGenerationClient,
OpenRouterImageGenerationClient,
StepFunImageGenerationClient,
)
@ -133,6 +134,54 @@ async def test_openrouter_image_generation_requires_api_key() -> None:
await client.generate(prompt="draw", model="model")
@pytest.mark.asyncio
async def test_ollama_image_generation_payload_and_response() -> None:
    """A ``/v1/`` base is rewritten to ``/api`` and the request/response are mapped correctly."""
    encoded_png = PNG_DATA_URL.removeprefix("data:image/png;base64,")
    transport = FakeClient(FakeResponse({"image": encoded_png}))
    client = OllamaImageGenerationClient(
        api_key="ollama-test",
        api_base="http://localhost:11434/v1/",
        extra_headers={"X-Test": "1"},
        extra_body={"seed": 123},
        client=transport,  # type: ignore[arg-type]
    )

    result = await client.generate(
        prompt="a sunset",
        model="x/z-image-turbo",
        aspect_ratio="16:9",
        image_size="1K",
    )

    # Response side: raw base64 is turned back into a data URL, no text content.
    assert result.images == [PNG_DATA_URL]
    assert result.content == ""

    # Request side: endpoint, auth/extra headers, and the JSON body.
    request = transport.calls[0]
    assert request["url"] == "http://localhost:11434/api/generate"
    sent_headers = request["headers"]
    assert sent_headers["Authorization"] == "Bearer ollama-test"
    assert sent_headers["X-Test"] == "1"

    sent_body = request["json"]
    expected_fields = {
        "model": "x/z-image-turbo",
        "prompt": "a sunset",
        "width": 1024,
        "height": 576,
        "steps": 0,
    }
    for field, expected in expected_fields.items():
        assert sent_body[field] == expected, field
    assert sent_body["stream"] is False
    assert sent_body["seed"] == 123
@pytest.mark.asyncio
async def test_ollama_image_generation_rejects_reference_images() -> None:
    """Supplying reference images raises ``ImageGenerationError``."""
    keyless_client = OllamaImageGenerationClient(api_key=None)
    request = {
        "prompt": "edit this",
        "model": "x/z-image-turbo",
        "reference_images": ["ref.png"],
    }

    with pytest.raises(ImageGenerationError, match="reference images"):
        await keyless_client.generate(**request)
@pytest.mark.asyncio
async def test_aihubmix_image_generation_payload_and_response() -> None:
raw_b64 = PNG_DATA_URL.removeprefix("data:image/png;base64,")

View File

@ -138,6 +138,39 @@ async def test_generate_image_tool_reports_missing_aihubmix_key(tmp_path: Path)
assert result.startswith("Error: AIHubMix API key is not configured")
@pytest.mark.asyncio
async def test_generate_image_tool_allows_ollama_without_api_key(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With no API key configured, the Ollama provider still produces an artifact."""
    set_config_path(tmp_path / "config.json")
    FakeImageClient.instances = []

    def fake_lookup(name):
        # Only the "ollama" provider resolves to the fake client.
        return FakeImageClient if name == "ollama" else None

    monkeypatch.setattr(
        "nanobot.agent.tools.image_generation.get_image_gen_provider",
        fake_lookup,
    )

    tool_config = ImageGenerationToolConfig(
        enabled=True,
        provider="ollama",
        model="x/z-image-turbo",
    )
    keyless_provider = ProviderConfig(api_base="http://localhost:11434/v1")
    tool = ImageGenerationTool(
        workspace=tmp_path,
        config=tool_config,
        provider_configs={"ollama": keyless_provider},
    )

    raw_result = await tool.execute(prompt="draw a cat")

    payload = json.loads(raw_result)
    assert len(payload["artifacts"]) == 1

    created_client = FakeImageClient.instances[0]
    assert created_client.kwargs["api_key"] is None
    assert created_client.kwargs["api_base"] == "http://localhost:11434/v1"

    first_call = created_client.calls[0]
    assert first_call["aspect_ratio"] == "1:1"
    assert first_call["image_size"] == "1K"
@pytest.mark.asyncio
async def test_generate_image_tool_rejects_reference_outside_workspace(tmp_path: Path) -> None:
set_config_path(tmp_path / "config.json")