Merge PR #3946: Add Ollama image generation support

Add Ollama image generation support
This commit is contained in:
Xubin Ren 2026-05-22 22:06:28 +08:00 committed by GitHub
commit ddfe5c3bdf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 253 additions and 3 deletions

View File

@ -23,7 +23,7 @@ The feature is disabled by default. Enable it in `~/.nanobot/config.json`, confi
} }
``` ```
See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, and Gemini configuration examples. See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, Gemini, Ollama, and StepFun configuration examples.
> [!TIP] > [!TIP]
> Prefer environment variables for API keys. nanobot resolves `${VAR_NAME}` values from the environment at startup. > Prefer environment variables for API keys. nanobot resolves `${VAR_NAME}` values from the environment at startup.
@ -46,7 +46,7 @@ The WebUI hides provider storage details from the user. The agent sees the saved
| Option | Type | Default | Description | | Option | Type | Default | Description |
|--------|------|---------|-------------| |--------|------|---------|-------------|
| `tools.imageGeneration.enabled` | boolean | `false` | Register the `generate_image` tool | | `tools.imageGeneration.enabled` | boolean | `false` | Register the `generate_image` tool |
| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `stepfun` | | `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, `stepfun` |
| `tools.imageGeneration.model` | string | `"openai/gpt-5.4-image-2"` | Provider model name | | `tools.imageGeneration.model` | string | `"openai/gpt-5.4-image-2"` | Provider model name |
| `tools.imageGeneration.defaultAspectRatio` | string | `"1:1"` | Default ratio when the prompt/tool call does not specify one | | `tools.imageGeneration.defaultAspectRatio` | string | `"1:1"` | Default ratio when the prompt/tool call does not specify one |
| `tools.imageGeneration.defaultImageSize` | string | `"1K"` | Default size hint, for example `1K`, `2K`, `4K`, or `1024x1024` | | `tools.imageGeneration.defaultImageSize` | string | `"1K"` | Default size hint, for example `1K`, `2K`, `4K`, or `1024x1024` |
@ -168,6 +168,31 @@ For reference-image edits, use a Gemini Flash image model:
Imagen 4 supports the aspect ratios `1:1`, `9:16`, `16:9`, `3:4`, and `4:3`. Unsupported ratios are ignored and the model uses its default. The `defaultImageSize` setting has no effect on Gemini models; sizing is controlled by `defaultAspectRatio` only. Reference images passed with an Imagen model are ignored (with a warning logged). Imagen 4 supports the aspect ratios `1:1`, `9:16`, `16:9`, `3:4`, and `4:3`. Unsupported ratios are ignored and the model uses its default. The `defaultImageSize` setting has no effect on Gemini models; sizing is controlled by `defaultAspectRatio` only. Reference images passed with an Imagen model are ignored (with a warning logged).
### Ollama
Ollama's experimental native image generation API works with local servers and hosted ollama.com models. Local access at `http://localhost:11434/api` does not require an API key; set `providers.ollama.apiKey` only when targeting `https://ollama.com/api`.
```json
{
"providers": {
"ollama": {
"apiBase": "http://localhost:11434/api"
}
},
"tools": {
"imageGeneration": {
"enabled": true,
"provider": "ollama",
"model": "x/z-image-turbo",
"defaultAspectRatio": "16:9",
"defaultImageSize": "2K"
}
}
}
```
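Ollama maps `defaultAspectRatio` and `defaultImageSize` to native `width` and `height` values: `1K`, `2K`, and `4K` set the long side to 1024, 2048, and 4096 pixels (unrecognized size hints fall back to 1024), and the short side is derived from the aspect ratio and rounded to a multiple of 8. An explicit size such as `1024x768` is sent as-is and takes precedence over the aspect ratio. Reference images are not supported by this integration.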
### StepFun ### StepFun
StepFun (阶跃星辰) `step-image-edit-2` supports text-to-image generation. The `step-1x-medium` variant additionally supports **style-reference** image edits, where a reference image guides the visual style of the output. StepFun (阶跃星辰) `step-image-edit-2` supports text-to-image generation. The `step-1x-medium` variant additionally supports **style-reference** image edits, where a reference image guides the visual style of the output.
@ -274,7 +299,7 @@ Use the reference image. Keep the same robot and composition, change the palette
|---------|-------| |---------|-------|
| `generate_image` is not available | Set `tools.imageGeneration.enabled` to `true` and restart the gateway | | `generate_image` is not available | Set `tools.imageGeneration.enabled` to `true` and restart the gateway |
| Missing API key error | Configure `providers.<provider>.apiKey`; if using `${VAR_NAME}`, confirm the environment variable is visible to the gateway process | | Missing API key error | Configure `providers.<provider>.apiKey`; if using `${VAR_NAME}`, confirm the environment variable is visible to the gateway process |
| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, or `stepfun` | | `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, or `stepfun` |
| AIHubMix says `Incorrect model ID` | Use `model: "gpt-image-2-free"`; nanobot expands it to the required `openai/gpt-image-2-free` model path internally | | AIHubMix says `Incorrect model ID` | Use `model: "gpt-image-2-free"`; nanobot expands it to the required `openai/gpt-image-2-free` model path internally |
| Generation times out | Try a smaller/default image size, set AIHubMix `extraBody.quality` to `"low"`, or retry later | | Generation times out | Try a smaller/default image size, set AIHubMix `extraBody.quality` to `"low"`, or retry later |
| Reference image rejected | Reference image paths must be inside the workspace or nanobot media directory and must be valid image files | | Reference image rejected | Reference image paths must be inside the workspace or nanobot media directory and must be valid image files |

View File

@ -5,6 +5,7 @@ from __future__ import annotations
import asyncio import asyncio
import base64 import base64
import binascii import binascii
import re
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
@ -32,6 +33,14 @@ _AIHUBMIX_ASPECT_RATIO_SIZES = {
} }
_GEMINI_DEFAULT_TIMEOUT_S = 120.0 _GEMINI_DEFAULT_TIMEOUT_S = 120.0
_GEMINI_IMAGEN_ASPECT_RATIOS = {"1:1", "9:16", "16:9", "3:4", "4:3"} _GEMINI_IMAGEN_ASPECT_RATIOS = {"1:1", "9:16", "16:9", "3:4", "4:3"}
# Long-edge pixel size used when the size hint is missing or not recognized.
_OLLAMA_DEFAULT_SIDE = 1024
# Symbolic size hints mapped to the long-edge pixel size they stand for.
_OLLAMA_SIZE_PRESETS = {"1K": 1024, "2K": 2048, "4K": 4096}
# Explicit "<width>x<height>" hint (either case of "x", optional whitespace).
_OLLAMA_EXPLICIT_SIZE_RE = re.compile(r"^\s*(\d+)\s*[xX]\s*(\d+)\s*$")
# "<w>:<h>" aspect ratio made of two non-negative integers.
_OLLAMA_ASPECT_RATIO_RE = re.compile(r"^\s*(\d+)\s*:\s*(\d+)\s*$")
class ImageGenerationError(RuntimeError): class ImageGenerationError(RuntimeError):
@ -430,6 +439,139 @@ def _http_error_detail(response: httpx.Response) -> str:
return response.text[:500] or "<empty response body>" return response.text[:500] or "<empty response body>"
def _round_to_multiple(value: float, multiple: int = 8) -> int:
    """Snap *value* to the nearest multiple of *multiple*, never going below one multiple.

    Halfway cases follow the built-in ``round`` (ties go to the even count of
    multiples), so ``20`` with the default step becomes ``16``, not ``24``.
    """
    step_count = round(value / multiple)
    snapped = int(step_count * multiple)
    return snapped if snapped > multiple else multiple
def _ollama_dimensions(aspect_ratio: str | None, image_size: str | None) -> tuple[int, int]:
    """Translate the generic ratio/size hints into a ``(width, height)`` pixel pair.

    An explicit ``"WxH"`` size is returned verbatim and the aspect ratio is not
    consulted. Otherwise the size hint selects the long edge (preset lookup is
    case-insensitive; anything unrecognized uses the default edge) and the
    short edge is derived from the ratio, rounded to a multiple of 8. A
    missing, malformed or zero-valued ratio produces a square image.
    """
    long_side = _OLLAMA_DEFAULT_SIDE
    if image_size:
        size_hint = image_size.strip()
        wxh = _OLLAMA_EXPLICIT_SIZE_RE.fullmatch(size_hint)
        if wxh is not None:
            width_text, height_text = wxh.groups()
            return int(width_text), int(height_text)
        long_side = _OLLAMA_SIZE_PRESETS.get(size_hint.upper(), _OLLAMA_DEFAULT_SIDE)

    square = (long_side, long_side)
    parsed = _OLLAMA_ASPECT_RATIO_RE.fullmatch(aspect_ratio.strip()) if aspect_ratio else None
    if parsed is None:
        return square

    ratio_w, ratio_h = (int(part) for part in parsed.groups())
    if ratio_w <= 0 or ratio_h <= 0:
        return square

    if ratio_w < ratio_h:
        # Portrait: height is the long edge, width is scaled down.
        height = long_side
        width = _round_to_multiple(long_side * ratio_w / ratio_h)
    else:
        # Landscape or square: width is the long edge, height is scaled down.
        width = long_side
        height = _round_to_multiple(long_side * ratio_h / ratio_w)
    return max(8, width), max(8, height)
def _ollama_image_data_url(value: str) -> str:
    """Return *value* as an image data URL.

    Strings that already start with ``data:image/`` pass through unchanged;
    anything else is handed to ``_b64_image_data_url`` for conversion.
    """
    already_data_url = value.startswith("data:image/")
    return value if already_data_url else _b64_image_data_url(value)
def _ollama_images_from_payload(payload: dict[str, Any]) -> list[str]:
    """Gather every image in an Ollama response as a list of data URLs.

    Reads the ``"image"`` key first and then ``"images"``. Each may hold a
    string or an arbitrarily nested list of strings; empty strings and values
    of any other type are skipped. Order of appearance is preserved.
    """

    def walk(node: Any):
        # Depth-first, left-to-right traversal of nested lists.
        if isinstance(node, list):
            for child in node:
                yield from walk(child)
        elif isinstance(node, str) and node:
            yield _ollama_image_data_url(node)

    return [*walk(payload.get("image")), *walk(payload.get("images"))]
class OllamaImageGenerationClient(ImageGenerationProvider):
    """Async client for Ollama native image generation models.

    Requests are POSTed to ``<api_base>/generate``. An API key is optional: the
    ``Authorization`` header is only sent when one is configured.
    """

    provider_name = "ollama"
    default_timeout = 300.0

    def _default_base_url(self) -> str:
        """Return the API root used when no base URL is configured."""
        return "http://localhost:11434/api"

    def _resolve_base_url(self, api_base: str | None) -> str:
        """Normalize a configured base URL to the native API root.

        Trailing slashes are dropped. A base ending in ``/v1`` is rewritten to
        end in ``/api`` instead (presumably so an OpenAI-compatible Ollama URL
        can be reused — confirm against the provider config conventions). An
        empty or missing value falls back to the local default.
        """
        if api_base:
            base = api_base.rstrip("/")
            if base.endswith("/v1"):
                return f"{base[:-3]}/api"
            return base
        return self._default_base_url()

    async def generate(
        self,
        *,
        prompt: str,
        model: str,
        reference_images: list[str] | None = None,
        aspect_ratio: str | None = None,
        image_size: str | None = None,
    ) -> GeneratedImageResponse:
        """Generate images for *prompt* with the given Ollama *model*.

        Args:
            prompt: Text prompt forwarded unchanged.
            model: Ollama model name.
            reference_images: Not supported; any non-empty value is rejected.
            aspect_ratio: Optional ``"W:H"`` ratio used to derive the dimensions.
            image_size: Optional size hint (preset name or explicit ``"WxH"``).

        Returns:
            The generated images as data URLs, the ``"response"`` text when it
            is a string (otherwise ``""``), and the raw decoded payload.

        Raises:
            ImageGenerationError: If reference images are supplied, the server
                answers with an error status, or the response body is not a
                JSON object. ``_require_images`` is also expected to raise when
                the payload carries no image (behavior defined on the base
                class — verify there).
        """
        if reference_images:
            raise ImageGenerationError(
                "Ollama image generation does not support reference images"
            )

        width, height = _ollama_dimensions(aspect_ratio, image_size)
        body: dict[str, Any] = {
            "model": model,
            "prompt": prompt,
            "width": width,
            "height": height,
            # NOTE(review): 0 presumably lets the server pick its default step
            # count — confirm against the Ollama API documentation.
            "steps": 0,
        }
        # extra_body may override any default above, but streaming is always
        # forced off because the reply is decoded as one JSON document.
        body.update(self.extra_body)
        body["stream"] = False

        headers = {
            "Content-Type": "application/json",
            **self.extra_headers,
        }
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        url = f"{self.api_base}/generate"
        response = await self._http_post(url, headers=headers, body=body)
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError as exc:
            detail = _http_error_detail(response)
            logger.error(
                "Ollama image generation failed (HTTP {}): {}",
                response.status_code,
                detail,
            )
            raise ImageGenerationError(
                f"Ollama image generation failed (HTTP {response.status_code}): {detail}"
            ) from exc

        try:
            data = response.json()
        except ValueError as exc:
            # json.JSONDecodeError subclasses ValueError; a 2xx reply with a
            # non-JSON body (e.g. from a proxy) must surface as a provider
            # error rather than an unrelated decoding exception.
            detail = _http_error_detail(response)
            logger.error(
                "Ollama image generation returned a non-JSON response: {}",
                detail,
            )
            raise ImageGenerationError(
                f"Ollama image generation returned a non-JSON response: {detail}"
            ) from exc
        if not isinstance(data, dict):
            # The payload helpers below rely on dict access.
            raise ImageGenerationError(
                "Ollama image generation returned an unexpected response payload: "
                f"{type(data).__name__}"
            )

        images = _ollama_images_from_payload(data)
        self._require_images(images, data)
        response_text = data.get("response")
        content = response_text if isinstance(response_text, str) else ""
        return GeneratedImageResponse(images=images, content=content, raw=data)
class GeminiImageGenerationClient(ImageGenerationProvider): class GeminiImageGenerationClient(ImageGenerationProvider):
"""Async client for Gemini/Imagen image generation via the Generative Language API.""" """Async client for Gemini/Imagen image generation via the Generative Language API."""
@ -1307,6 +1449,7 @@ def _stepfun_images_from_payload(payload: dict[str, Any]) -> list[str]:
register_image_gen_provider(AIHubMixImageGenerationClient) register_image_gen_provider(AIHubMixImageGenerationClient)
register_image_gen_provider(CodexImageGenerationClient) register_image_gen_provider(CodexImageGenerationClient)
register_image_gen_provider(GeminiImageGenerationClient) register_image_gen_provider(GeminiImageGenerationClient)
register_image_gen_provider(OllamaImageGenerationClient)
register_image_gen_provider(MiniMaxImageGenerationClient) register_image_gen_provider(MiniMaxImageGenerationClient)
register_image_gen_provider(OpenAIImageGenerationClient) register_image_gen_provider(OpenAIImageGenerationClient)
register_image_gen_provider(OpenRouterImageGenerationClient) register_image_gen_provider(OpenRouterImageGenerationClient)

View File

@ -14,6 +14,7 @@ from nanobot.providers.image_generation import (
GeneratedImageResponse, GeneratedImageResponse,
ImageGenerationError, ImageGenerationError,
MiniMaxImageGenerationClient, MiniMaxImageGenerationClient,
OllamaImageGenerationClient,
OpenAIImageGenerationClient, OpenAIImageGenerationClient,
OpenRouterImageGenerationClient, OpenRouterImageGenerationClient,
StepFunImageGenerationClient, StepFunImageGenerationClient,
@ -146,6 +147,54 @@ async def test_openrouter_image_generation_requires_api_key() -> None:
await client.generate(prompt="draw", model="model") await client.generate(prompt="draw", model="model")
@pytest.mark.asyncio
async def test_ollama_image_generation_payload_and_response() -> None:
    """A ``/v1`` base is rewritten to ``/api`` and the request/response are mapped."""
    encoded_png = PNG_DATA_URL.removeprefix("data:image/png;base64,")
    transport = FakeClient(FakeResponse({"image": encoded_png}))
    ollama = OllamaImageGenerationClient(
        api_key="ollama-test",
        api_base="http://localhost:11434/v1/",
        extra_headers={"X-Test": "1"},
        extra_body={"seed": 123},
        client=transport,  # type: ignore[arg-type]
    )

    result = await ollama.generate(
        prompt="a sunset",
        model="x/z-image-turbo",
        aspect_ratio="16:9",
        image_size="1K",
    )

    assert result.images == [PNG_DATA_URL]
    assert result.content == ""

    request = transport.calls[0]
    assert request["url"] == "http://localhost:11434/api/generate"

    sent_headers = request["headers"]
    assert sent_headers["Authorization"] == "Bearer ollama-test"
    assert sent_headers["X-Test"] == "1"

    payload = request["json"]
    assert payload["model"] == "x/z-image-turbo"
    assert payload["prompt"] == "a sunset"
    # 16:9 at the 1K preset: long edge 1024, short edge 576.
    assert (payload["width"], payload["height"]) == (1024, 576)
    assert payload["steps"] == 0
    assert payload["stream"] is False
    assert payload["seed"] == 123
@pytest.mark.asyncio
async def test_ollama_image_generation_rejects_reference_images() -> None:
    """Passing any reference image must fail with an ImageGenerationError."""
    ollama = OllamaImageGenerationClient(api_key=None)
    request = {
        "prompt": "edit this",
        "model": "x/z-image-turbo",
        "reference_images": ["ref.png"],
    }

    with pytest.raises(ImageGenerationError, match="reference images"):
        await ollama.generate(**request)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_aihubmix_image_generation_payload_and_response() -> None: async def test_aihubmix_image_generation_payload_and_response() -> None:
raw_b64 = PNG_DATA_URL.removeprefix("data:image/png;base64,") raw_b64 = PNG_DATA_URL.removeprefix("data:image/png;base64,")

View File

@ -138,6 +138,39 @@ async def test_generate_image_tool_reports_missing_aihubmix_key(tmp_path: Path)
assert result.startswith("Error: AIHubMix API key is not configured") assert result.startswith("Error: AIHubMix API key is not configured")
@pytest.mark.asyncio
async def test_generate_image_tool_allows_ollama_without_api_key(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The tool runs with the ollama provider even though no API key is set."""
    set_config_path(tmp_path / "config.json")
    FakeImageClient.instances = []

    def lookup_provider(name):
        # Only the ollama provider resolves to the fake client.
        if name == "ollama":
            return FakeImageClient
        return None

    monkeypatch.setattr(
        "nanobot.agent.tools.image_generation.get_image_gen_provider",
        lookup_provider,
    )

    tool_config = ImageGenerationToolConfig(
        enabled=True,
        provider="ollama",
        model="x/z-image-turbo",
    )
    ollama_provider = ProviderConfig(api_base="http://localhost:11434/v1")
    tool = ImageGenerationTool(
        workspace=tmp_path,
        config=tool_config,
        provider_configs={"ollama": ollama_provider},
    )

    raw_result = await tool.execute(prompt="draw a cat")
    artifacts = json.loads(raw_result)["artifacts"]
    assert len(artifacts) == 1

    created = FakeImageClient.instances[0]
    assert created.kwargs["api_key"] is None
    assert created.kwargs["api_base"] == "http://localhost:11434/v1"

    first_call = created.calls[0]
    assert first_call["aspect_ratio"] == "1:1"
    assert first_call["image_size"] == "1K"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_generate_image_tool_rejects_reference_outside_workspace(tmp_path: Path) -> None: async def test_generate_image_tool_rejects_reference_outside_workspace(tmp_path: Path) -> None:
set_config_path(tmp_path / "config.json") set_config_path(tmp_path / "config.json")