mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-23 18:12:32 +00:00
Add Ollama image generation support
This commit is contained in:
parent
eae51333ad
commit
84603f4cf2
@ -23,7 +23,7 @@ The feature is disabled by default. Enable it in `~/.nanobot/config.json`, confi
|
||||
}
|
||||
```
|
||||
|
||||
See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, and Gemini configuration examples.
|
||||
See [Provider Notes](#provider-notes) for AIHubMix, MiniMax, Gemini, Ollama, and StepFun configuration examples.
|
||||
|
||||
> [!TIP]
|
||||
> Prefer environment variables for API keys. nanobot resolves `${VAR_NAME}` values from the environment at startup.
|
||||
@ -46,7 +46,7 @@ The WebUI hides provider storage details from the user. The agent sees the saved
|
||||
| Option | Type | Default | Description |
|
||||
|--------|------|---------|-------------|
|
||||
| `tools.imageGeneration.enabled` | boolean | `false` | Register the `generate_image` tool |
|
||||
| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `stepfun` |
|
||||
| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, `stepfun` |
|
||||
| `tools.imageGeneration.model` | string | `"openai/gpt-5.4-image-2"` | Provider model name |
|
||||
| `tools.imageGeneration.defaultAspectRatio` | string | `"1:1"` | Default ratio when the prompt/tool call does not specify one |
|
||||
| `tools.imageGeneration.defaultImageSize` | string | `"1K"` | Default size hint, for example `1K`, `2K`, `4K`, or `1024x1024` |
|
||||
@ -168,6 +168,31 @@ For reference-image edits, use a Gemini Flash image model:
|
||||
|
||||
Imagen 4 supports the aspect ratios `1:1`, `9:16`, `16:9`, `3:4`, and `4:3`. Unsupported ratios are ignored and the model uses its default. The `defaultImageSize` setting has no effect on Gemini models; sizing is controlled by `defaultAspectRatio` only. Reference images passed with an Imagen model are ignored (with a warning logged).
|
||||
|
||||
### Ollama
|
||||
|
||||
Ollama's experimental native image generation API works with local servers and hosted ollama.com models. Local access at `http://localhost:11434/api` does not require an API key; set `providers.ollama.apiKey` only when targeting `https://ollama.com/api`.
|
||||
|
||||
```json
|
||||
{
|
||||
"providers": {
|
||||
"ollama": {
|
||||
"apiBase": "http://localhost:11434/api"
|
||||
}
|
||||
},
|
||||
"tools": {
|
||||
"imageGeneration": {
|
||||
"enabled": true,
|
||||
"provider": "ollama",
|
||||
"model": "x/z-image-turbo",
|
||||
"defaultAspectRatio": "16:9",
|
||||
"defaultImageSize": "2K"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
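Ollama maps `defaultAspectRatio` and `defaultImageSize` to native `width` and `height` values: `1K`, `2K`, and `4K` set the long side to 1024, 2048, and 4096 pixels, the aspect ratio determines the short side (rounded to a multiple of 8), and an explicit size such as `1024x768` is sent as-is. Reference images are not supported by this integration.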
|
||||
|
||||
### StepFun
|
||||
|
||||
StepFun (阶跃星辰) `step-image-edit-2` supports text-to-image generation. The `step-1x-medium` variant additionally supports **style-reference** image edits, where a reference image guides the visual style of the output.
|
||||
@ -274,7 +299,7 @@ Use the reference image. Keep the same robot and composition, change the palette
|
||||
|---------|-------|
|
||||
| `generate_image` is not available | Set `tools.imageGeneration.enabled` to `true` and restart the gateway |
|
||||
| Missing API key error | Configure `providers.<provider>.apiKey`; if using `${VAR_NAME}`, confirm the environment variable is visible to the gateway process |
|
||||
| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, or `stepfun` |
|
||||
| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, `gemini`, `ollama`, or `stepfun` |
|
||||
| AIHubMix says `Incorrect model ID` | Use `model: "gpt-image-2-free"`; nanobot expands it to the required `openai/gpt-image-2-free` model path internally |
|
||||
| Generation times out | Try a smaller/default image size, set AIHubMix `extraBody.quality` to `"low"`, or retry later |
|
||||
| Reference image rejected | Reference image paths must be inside the workspace or nanobot media directory and must be valid image files |
|
||||
|
||||
@ -21,6 +21,7 @@ from nanobot.providers.image_generation import (
|
||||
ImageGenerationProvider,
|
||||
get_image_gen_provider,
|
||||
)
|
||||
from nanobot.providers.registry import find_by_name
|
||||
from nanobot.utils.artifacts import (
|
||||
ArtifactError,
|
||||
generated_image_tool_result,
|
||||
@ -117,6 +118,10 @@ class ImageGenerationTool(Tool):
|
||||
def _provider_config(self) -> ProviderConfig | None:
|
||||
return self.provider_configs.get(self.config.provider)
|
||||
|
||||
def _provider_allows_missing_api_key(self) -> bool:
    """Tell whether the selected provider may be used without an API key.

    The provider spec is looked up by name in the provider registry; a key is
    optional when that spec is flagged as local, direct, or OAuth-based.
    Providers without a registry entry always require a key.
    """
    spec = find_by_name(self.config.provider)
    if not spec:
        return False
    return bool(spec.is_local or spec.is_direct or spec.is_oauth)
|
||||
|
||||
def _provider_client(self) -> ImageGenerationProvider | None:
|
||||
provider = self._provider_config()
|
||||
cls = get_image_gen_provider(self.config.provider)
|
||||
@ -174,7 +179,7 @@ class ImageGenerationTool(Tool):
|
||||
if client is None:
|
||||
return f"Error: unsupported image generation provider '{self.config.provider}'"
|
||||
provider = self._provider_config()
|
||||
if not provider or not provider.api_key:
|
||||
if not self._provider_allows_missing_api_key() and (not provider or not provider.api_key):
|
||||
return self._missing_api_key_error()
|
||||
|
||||
requested = count or 1
|
||||
|
||||
@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import binascii
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@ -31,6 +32,14 @@ _AIHUBMIX_ASPECT_RATIO_SIZES = {
|
||||
}
|
||||
_GEMINI_DEFAULT_TIMEOUT_S = 120.0
|
||||
_GEMINI_IMAGEN_ASPECT_RATIOS = {"1:1", "9:16", "16:9", "3:4", "4:3"}
|
||||
_OLLAMA_DEFAULT_SIDE = 1024
|
||||
_OLLAMA_SIZE_PRESETS = {
|
||||
"1K": 1024,
|
||||
"2K": 2048,
|
||||
"4K": 4096,
|
||||
}
|
||||
_OLLAMA_EXPLICIT_SIZE_RE = re.compile(r"^\s*(\d+)\s*[xX]\s*(\d+)\s*$")
|
||||
_OLLAMA_ASPECT_RATIO_RE = re.compile(r"^\s*(\d+)\s*:\s*(\d+)\s*$")
|
||||
|
||||
|
||||
class ImageGenerationError(RuntimeError):
|
||||
@ -429,6 +438,139 @@ def _http_error_detail(response: httpx.Response) -> str:
|
||||
return response.text[:500] or "<empty response body>"
|
||||
|
||||
|
||||
def _round_to_multiple(value: float, multiple: int = 8) -> int:
|
||||
rounded = int(round(value / multiple) * multiple)
|
||||
return max(multiple, rounded)
|
||||
|
||||
|
||||
def _ollama_dimensions(aspect_ratio: str | None, image_size: str | None) -> tuple[int, int]:
    """Translate an aspect-ratio / size hint pair into ``(width, height)`` pixels.

    Args:
        aspect_ratio: Ratio such as ``"16:9"``; anything missing, malformed,
            or containing a zero term yields a square image.
        image_size: Either an explicit ``"<width>x<height>"`` string, which is
            returned as-is and takes precedence over ``aspect_ratio``, or a
            preset keyword (``1K``/``2K``/``4K``, case-insensitive) selecting
            the long side. Unknown or missing values use the default side.

    Returns:
        The ``(width, height)`` pair. When derived from a ratio, the long side
        equals the preset and the short side is rounded to a multiple of 8.
    """
    base_side = _OLLAMA_DEFAULT_SIDE
    if image_size:
        token = image_size.strip()
        exact = _OLLAMA_EXPLICIT_SIZE_RE.fullmatch(token)
        if exact is not None:
            # Explicit dimensions win outright; the aspect ratio is ignored.
            return int(exact.group(1)), int(exact.group(2))
        base_side = _OLLAMA_SIZE_PRESETS.get(token.upper(), _OLLAMA_DEFAULT_SIDE)

    square = (base_side, base_side)
    parsed = _OLLAMA_ASPECT_RATIO_RE.fullmatch(aspect_ratio.strip()) if aspect_ratio else None
    if parsed is None:
        return square

    w_part = int(parsed.group(1))
    h_part = int(parsed.group(2))
    if w_part <= 0 or h_part <= 0:
        return square

    if w_part < h_part:
        # Portrait: height is the long side.
        height = base_side
        width = _round_to_multiple(base_side * w_part / h_part)
    else:
        # Landscape or square: width is the long side.
        width = base_side
        height = _round_to_multiple(base_side * h_part / w_part)
    return max(8, width), max(8, height)
|
||||
|
||||
|
||||
def _ollama_image_data_url(value: str) -> str:
    """Return ``value`` as an image data URL.

    Strings already starting with ``data:image/`` pass through untouched;
    anything else is handed to ``_b64_image_data_url`` for conversion.
    """
    is_data_url = value.startswith("data:image/")
    return value if is_data_url else _b64_image_data_url(value)
|
||||
|
||||
|
||||
def _ollama_images_from_payload(payload: dict[str, Any]) -> list[str]:
    """Collect image data URLs from an Ollama response payload.

    Reads the ``"image"`` key and then the ``"images"`` key. Each may hold a
    string or an arbitrarily nested list of strings; empty strings and values
    of any other type are skipped. Order of appearance is preserved.
    """

    def walk(node: Any):
        # Depth-first, left-to-right traversal of strings and nested lists.
        if isinstance(node, list):
            for child in node:
                yield from walk(child)
        elif isinstance(node, str) and node:
            yield _ollama_image_data_url(node)

    return [*walk(payload.get("image")), *walk(payload.get("images"))]
|
||||
|
||||
|
||||
class OllamaImageGenerationClient(ImageGenerationProvider):
    """Async client for Ollama native image generation models.

    Posts to ``<api_base>/generate`` with explicit ``width``/``height`` values
    derived from the aspect-ratio and size hints. An ``Authorization`` header
    is sent only when an API key is set. Reference images are rejected.
    """

    provider_name = "ollama"
    default_timeout = 300.0

    def _default_base_url(self) -> str:
        """Return the local Ollama API root used when no base URL is given."""
        return "http://localhost:11434/api"

    def _resolve_base_url(self, api_base: str | None) -> str:
        """Normalise a configured base URL to the native Ollama API root.

        Trailing slashes are dropped and a trailing ``/v1`` segment is swapped
        for ``/api``. An empty or missing value yields the default base URL.
        """
        if not api_base:
            return self._default_base_url()
        trimmed = api_base.rstrip("/")
        if not trimmed.endswith("/v1"):
            return trimmed
        return f"{trimmed[:-3]}/api"

    async def generate(
        self,
        *,
        prompt: str,
        model: str,
        reference_images: list[str] | None = None,
        aspect_ratio: str | None = None,
        image_size: str | None = None,
    ) -> GeneratedImageResponse:
        """Generate images for ``prompt`` with the given Ollama ``model``.

        Args:
            prompt: Text prompt sent to the model.
            model: Ollama model name.
            reference_images: Must be empty or ``None``; not supported here.
            aspect_ratio: Optional ratio hint such as ``"16:9"``.
            image_size: Optional size hint (preset keyword or ``"WxH"``).

        Returns:
            The generated images, the payload's ``"response"`` text when it is
            a string (otherwise ``""``), and the raw payload.

        Raises:
            ImageGenerationError: If reference images are supplied or the
                server answers with an HTTP error status.
        """
        if reference_images:
            raise ImageGenerationError(
                "Ollama image generation does not support reference images"
            )

        width, height = _ollama_dimensions(aspect_ratio, image_size)
        # NOTE(review): "steps": 0 presumably lets the server choose its own
        # step count - confirm against the Ollama API.
        request_body: dict[str, Any] = {
            "model": model,
            "prompt": prompt,
            "width": width,
            "height": height,
            "steps": 0,
        }
        # Caller-supplied extras may override the defaults above, but
        # streaming is always forced off afterwards.
        request_body.update(self.extra_body)
        request_body["stream"] = False

        request_headers = {"Content-Type": "application/json", **self.extra_headers}
        if self.api_key:
            request_headers["Authorization"] = f"Bearer {self.api_key}"

        response = await self._http_post(
            f"{self.api_base}/generate",
            headers=request_headers,
            body=request_body,
        )

        try:
            response.raise_for_status()
        except httpx.HTTPStatusError as exc:
            detail = _http_error_detail(response)
            logger.error(
                "Ollama image generation failed (HTTP {}): {}",
                response.status_code,
                detail,
            )
            raise ImageGenerationError(
                f"Ollama image generation failed (HTTP {response.status_code}): {detail}"
            ) from exc

        payload = response.json()
        images = _ollama_images_from_payload(payload)
        self._require_images(images, payload)

        text = payload.get("response")
        return GeneratedImageResponse(
            images=images,
            content=text if isinstance(text, str) else "",
            raw=payload,
        )
|
||||
|
||||
|
||||
class GeminiImageGenerationClient(ImageGenerationProvider):
|
||||
"""Async client for Gemini/Imagen image generation via the Generative Language API."""
|
||||
|
||||
@ -886,5 +1028,6 @@ def _stepfun_images_from_payload(payload: dict[str, Any]) -> list[str]:
|
||||
register_image_gen_provider(OpenRouterImageGenerationClient)
|
||||
register_image_gen_provider(AIHubMixImageGenerationClient)
|
||||
register_image_gen_provider(GeminiImageGenerationClient)
|
||||
register_image_gen_provider(OllamaImageGenerationClient)
|
||||
register_image_gen_provider(MiniMaxImageGenerationClient)
|
||||
register_image_gen_provider(StepFunImageGenerationClient)
|
||||
|
||||
@ -13,6 +13,7 @@ from nanobot.providers.image_generation import (
|
||||
GeneratedImageResponse,
|
||||
ImageGenerationError,
|
||||
MiniMaxImageGenerationClient,
|
||||
OllamaImageGenerationClient,
|
||||
OpenRouterImageGenerationClient,
|
||||
StepFunImageGenerationClient,
|
||||
)
|
||||
@ -133,6 +134,54 @@ async def test_openrouter_image_generation_requires_api_key() -> None:
|
||||
await client.generate(prompt="draw", model="model")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ollama_image_generation_payload_and_response() -> None:
    """A raw base64 image comes back as a data URL and the request is well-formed."""
    encoded_png = PNG_DATA_URL.removeprefix("data:image/png;base64,")
    transport = FakeClient(FakeResponse({"image": encoded_png}))
    ollama = OllamaImageGenerationClient(
        api_key="ollama-test",
        api_base="http://localhost:11434/v1/",
        extra_headers={"X-Test": "1"},
        extra_body={"seed": 123},
        client=transport,  # type: ignore[arg-type]
    )

    result = await ollama.generate(
        prompt="a sunset",
        model="x/z-image-turbo",
        aspect_ratio="16:9",
        image_size="1K",
    )

    # Response side: the bare base64 payload is wrapped into a data URL.
    assert result.images == [PNG_DATA_URL]
    assert result.content == ""

    # Request side: the "/v1/" base is rewritten to the native "/api" root.
    sent = transport.calls[0]
    assert sent["url"] == "http://localhost:11434/api/generate"

    sent_headers = sent["headers"]
    assert sent_headers["Authorization"] == "Bearer ollama-test"
    assert sent_headers["X-Test"] == "1"

    sent_body = sent["json"]
    assert sent_body["model"] == "x/z-image-turbo"
    assert sent_body["prompt"] == "a sunset"
    assert sent_body["width"] == 1024
    assert sent_body["height"] == 576
    assert sent_body["steps"] == 0
    assert sent_body["stream"] is False
    assert sent_body["seed"] == 123
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ollama_image_generation_rejects_reference_images() -> None:
    """Passing reference images to the Ollama client raises ImageGenerationError."""
    ollama = OllamaImageGenerationClient(api_key=None)
    request = {
        "prompt": "edit this",
        "model": "x/z-image-turbo",
        "reference_images": ["ref.png"],
    }

    with pytest.raises(ImageGenerationError, match="reference images"):
        await ollama.generate(**request)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_aihubmix_image_generation_payload_and_response() -> None:
|
||||
raw_b64 = PNG_DATA_URL.removeprefix("data:image/png;base64,")
|
||||
|
||||
@ -138,6 +138,39 @@ async def test_generate_image_tool_reports_missing_aihubmix_key(tmp_path: Path)
|
||||
assert result.startswith("Error: AIHubMix API key is not configured")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_generate_image_tool_allows_ollama_without_api_key(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The tool runs the Ollama provider even though no API key is configured."""
    set_config_path(tmp_path / "config.json")
    FakeImageClient.instances = []

    def fake_lookup(name):
        # Only the Ollama provider resolves to the fake client.
        return FakeImageClient if name == "ollama" else None

    monkeypatch.setattr(
        "nanobot.agent.tools.image_generation.get_image_gen_provider",
        fake_lookup,
    )

    tool_config = ImageGenerationToolConfig(
        enabled=True,
        provider="ollama",
        model="x/z-image-turbo",
    )
    ollama_entry = ProviderConfig(api_base="http://localhost:11434/v1")
    tool = ImageGenerationTool(
        workspace=tmp_path,
        config=tool_config,
        provider_configs={"ollama": ollama_entry},
    )

    raw_result = await tool.execute(prompt="draw a cat")

    artifacts = json.loads(raw_result)["artifacts"]
    assert len(artifacts) == 1

    created = FakeImageClient.instances[0]
    assert created.kwargs["api_key"] is None
    assert created.kwargs["api_base"] == "http://localhost:11434/v1"

    first_call = created.calls[0]
    assert first_call["aspect_ratio"] == "1:1"
    assert first_call["image_size"] == "1K"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_generate_image_tool_rejects_reference_outside_workspace(tmp_path: Path) -> None:
|
||||
set_config_path(tmp_path / "config.json")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user