From bb788cdb7d02e34215c7fbce93ca5b3c99c07e00 Mon Sep 17 00:00:00 2001
From: Kaloyan Tenchov <zayfod@gmail.com>
Date: Sun, 17 May 2026 22:06:18 -0400
Subject: [PATCH] feat(image-generation): add Gemini provider support

Adds GeminiImageGenerationClient covering both Imagen 4 (:predict) and
Gemini Flash (:generateContent), wires the gemini ProviderConfig through
the SDK, API server, and gateway entry points, and updates the
image-generation docs and skill. Errors from the Gemini endpoints are
logged and surface with the HTTP status and parsed message instead of an
empty string.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/image-generation.md                 |  56 +++++-
 nanobot/agent/tools/image_generation.py  |   7 +-
 nanobot/cli/commands.py                  |   2 +
 nanobot/nanobot.py                       |   1 +
 nanobot/providers/image_generation.py    | 217 ++++++++++++++++++++++-
 nanobot/skills/image-generation/SKILL.md |  21 +++
 tests/providers/test_image_generation.py | 135 ++++++++++++++
 7 files changed, 433 insertions(+), 6 deletions(-)

diff --git a/docs/image-generation.md b/docs/image-generation.md
index 5c63fddf1..cd1ac2c89 100644
--- a/docs/image-generation.md
+++ b/docs/image-generation.md
@@ -48,6 +48,28 @@ AIHubMix example:
 }
 ```
 
+Gemini example (Imagen 4):
+
+```json
+{
+  "providers": {
+    "gemini": {
+      "apiKey": "${GEMINI_API_KEY}"
+    }
+  },
+  "tools": {
+    "imageGeneration": {
+      "enabled": true,
+      "provider": "gemini",
+      "model": "imagen-4.0-generate-001",
+      "defaultAspectRatio": "1:1"
+    }
+  }
+}
+```
+
+For Gemini Flash (which supports reference-image edits) see the [Gemini](#gemini) section below.
+
 > [!TIP]
 > Prefer environment variables for API keys. nanobot resolves `${VAR_NAME}` values from the environment at startup.
 
@@ -69,7 +91,7 @@ The WebUI hides provider storage details from the user. The agent sees the saved
 | Option | Type | Default | Description |
 |--------|------|---------|-------------|
 | `tools.imageGeneration.enabled` | boolean | `false` | Register the `generate_image` tool |
-| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Currently `openrouter` and `aihubmix` are supported |
+| `tools.imageGeneration.provider` | string | `"openrouter"` | Image provider name. Supported values: `openrouter`, `aihubmix`, `minimax`, `gemini` |
 | `tools.imageGeneration.model` | string | `"openai/gpt-5.4-image-2"` | Provider model name |
 | `tools.imageGeneration.defaultAspectRatio` | string | `"1:1"` | Default ratio when the prompt/tool call does not specify one |
 | `tools.imageGeneration.defaultImageSize` | string | `"1K"` | Default size hint, for example `1K`, `2K`, `4K`, or `1024x1024` |
@@ -139,6 +161,36 @@ Configure:
 
 `quality: low` is optional. It can make free image models faster and less likely to time out, but it is not required for correctness.
 
+### Gemini
+
+nanobot supports two Gemini image generation model families via Google's Generative Language API:
+
+| Model | Endpoint | Reference images |
+|-------|----------|-----------------|
+| `imagen-4.0-generate-001` | `:predict` | Not supported by this integration |
+| `gemini-2.5-flash-image` | `:generateContent` | Supported |
+
+For reference-image edits, use a Gemini Flash image model:
+
+```json
+{
+  "providers": {
+    "gemini": {
+      "apiKey": "${GEMINI_API_KEY}"
+    }
+  },
+  "tools": {
+    "imageGeneration": {
+      "enabled": true,
+      "provider": "gemini",
+      "model": "gemini-2.5-flash-image"
+    }
+  }
+}
+```
+
+Imagen 4 supports the aspect ratios `1:1`, `9:16`, `16:9`, `3:4`, and `4:3`. Unsupported ratios are ignored and the model uses its default. The `defaultImageSize` setting has no effect on Gemini models. `defaultAspectRatio` (and any per-call aspect ratio) is sent to Imagen models only; this integration does not forward it to Gemini Flash image models. Reference images passed with an Imagen model are ignored (with a warning logged).
+
 ## Artifacts
 
 Generated images are stored under the active nanobot instance's media directory:
@@ -193,7 +245,7 @@ Use the reference image. Keep the same robot and composition, change the palette
 |---------|-------|
 | `generate_image` is not available | Set `tools.imageGeneration.enabled` to `true` and restart the gateway |
 | Missing API key error | Configure `providers.<provider>.apiKey`; if using `${VAR_NAME}`, confirm the environment variable is visible to the gateway process |
-| `unsupported image generation provider` | Use `openrouter` or `aihubmix` |
+| `unsupported image generation provider` | Use `openrouter`, `aihubmix`, `minimax`, or `gemini` |
 | AIHubMix says `Incorrect model ID` | Use `model: "gpt-image-2-free"`; nanobot expands it to the required `openai/gpt-image-2-free` model path internally |
 | Generation times out | Try a smaller/default image size, set AIHubMix `extraBody.quality` to `"low"`, or retry later |
 | Reference image rejected | Reference image paths must be inside the workspace or nanobot media directory and must be valid image files |
diff --git a/nanobot/agent/tools/image_generation.py b/nanobot/agent/tools/image_generation.py
index eedbecadc..3dec8eb92 100644
--- a/nanobot/agent/tools/image_generation.py
+++ b/nanobot/agent/tools/image_generation.py
@@ -18,6 +18,7 @@ from nanobot.config.paths import get_media_dir
 from nanobot.config.schema import Base
 from nanobot.providers.image_generation import (
     AIHubMixImageGenerationClient,
+    GeminiImageGenerationClient,
     ImageGenerationError,
     MiniMaxImageGenerationClient,
     OpenRouterImageGenerationClient,
@@ -120,7 +121,7 @@ class ImageGenerationTool(Tool):
 
     def _provider_client(
         self,
-    ) -> OpenRouterImageGenerationClient | AIHubMixImageGenerationClient | MiniMaxImageGenerationClient | None:
+    ) -> OpenRouterImageGenerationClient | AIHubMixImageGenerationClient | MiniMaxImageGenerationClient | GeminiImageGenerationClient | None:
         provider = self._provider_config()
         kwargs = {
             "api_key": provider.api_key if provider else None,
@@ -134,6 +135,8 @@ class ImageGenerationTool(Tool):
             return AIHubMixImageGenerationClient(**kwargs)
         if self.config.provider == "minimax":
             return MiniMaxImageGenerationClient(**kwargs)
+        if self.config.provider == "gemini":
+            return GeminiImageGenerationClient(**kwargs)
         return None
 
     def _missing_api_key_error(self) -> str:
@@ -144,6 +147,8 @@ class ImageGenerationTool(Tool):
             return "Error: AIHubMix API key is not configured. Set providers.aihubmix.apiKey."
         if provider == "minimax":
             return "Error: MiniMax API key is not configured. Set providers.minimax.apiKey."
+        if provider == "gemini":
+            return "Error: Gemini API key is not configured. Set providers.gemini.apiKey."
         return f"Error: {provider} API key is not configured."
 
     def _resolve_reference_image(self, value: str) -> str:
diff --git a/nanobot/cli/commands.py b/nanobot/cli/commands.py
index bd5e7f453..cedc03bd0 100644
--- a/nanobot/cli/commands.py
+++ b/nanobot/cli/commands.py
@@ -643,6 +643,7 @@ def serve(
                 "openrouter": runtime_config.providers.openrouter,
                 "aihubmix": runtime_config.providers.aihubmix,
                 "minimax": runtime_config.providers.minimax,
+                "gemini": runtime_config.providers.gemini,
             },
         )
     except ValueError as exc:
@@ -757,6 +758,7 @@ def _run_gateway(
             "openrouter": config.providers.openrouter,
             "aihubmix": config.providers.aihubmix,
             "minimax": config.providers.minimax,
+            "gemini": config.providers.gemini,
         },
         provider_snapshot_loader=load_provider_snapshot,
         runtime_model_publisher=lambda model, preset: publish_runtime_model_update(
diff --git a/nanobot/nanobot.py b/nanobot/nanobot.py
index 09c58de05..527f81b16 100644
--- a/nanobot/nanobot.py
+++ b/nanobot/nanobot.py
@@ -67,6 +67,7 @@ class Nanobot:
                 "openrouter": config.providers.openrouter,
                 "aihubmix": config.providers.aihubmix,
                 "minimax": config.providers.minimax,
+                "gemini": config.providers.gemini,
             },
         )
         return cls(loop)
diff --git a/nanobot/providers/image_generation.py b/nanobot/providers/image_generation.py
index dc74c62f3..1b0c5189d 100644
--- a/nanobot/providers/image_generation.py
+++ b/nanobot/providers/image_generation.py
@@ -8,6 +8,7 @@ from pathlib import Path
 from typing import Any
 
 import httpx
+from loguru import logger
 
 from nanobot.providers.registry import find_by_name
 from nanobot.utils.helpers import detect_image_mime
@@ -26,6 +27,8 @@ _AIHUBMIX_ASPECT_RATIO_SIZES = {
     "4:3": "1536x1024",
     "16:9": "1536x1024",
 }
+_GEMINI_DEFAULT_TIMEOUT_S = 120.0
+_GEMINI_IMAGEN_ASPECT_RATIOS = {"1:1", "9:16", "16:9", "3:4", "4:3"}
 
 
 class ImageGenerationError(RuntimeError):
@@ -50,17 +53,28 @@ def _provider_base_url(provider: str, api_base: str | None, fallback: str) -> st
     return fallback
 
 
-def image_path_to_data_url(path: str | Path) -> str:
-    """Convert a local image path to an image data URL."""
+def _read_image_b64(path: str | Path) -> tuple[str, str]:
+    """Return ``(mime, base64)`` for the image at ``path``."""
     p = Path(path).expanduser()
     raw = p.read_bytes()
     mime = detect_image_mime(raw)
     if mime is None:
         raise ImageGenerationError(f"unsupported reference image: {p}")
-    encoded = base64.b64encode(raw).decode("ascii")
+    return mime, base64.b64encode(raw).decode("ascii")
+
+
+def image_path_to_data_url(path: str | Path) -> str:
+    """Convert a local image path to an image data URL."""
+    mime, encoded = _read_image_b64(path)
     return f"data:{mime};base64,{encoded}"
 
 
+def image_path_to_inline_data(path: str | Path) -> dict[str, str]:
+    """Build the Gemini ``inlineData`` dict (``mimeType`` plus base64 ``data``) for a local image."""
+    mime_type, payload = _read_image_b64(path)
+    return dict(mimeType=mime_type, data=payload)
+
+
 def _b64_png_data_url(value: str) -> str:
     return f"data:image/png;base64,{value}"
 
@@ -341,6 +355,203 @@ class AIHubMixImageGenerationClient:
         return GeneratedImageResponse(images=images, content="", raw=payload)
 
 
+def _http_error_detail(response: httpx.Response) -> str:
+    """Return the provider's error message from ``response``, falling back to the raw body text."""
+    try:
+        data = response.json()
+    except ValueError:  # body is not valid JSON (JSONDecodeError/UnicodeDecodeError); use raw text below
+        data = None
+    if isinstance(data, dict):
+        err = data.get("error")
+        if isinstance(err, dict):
+            return str(err.get("message") or err)  # str() keeps the declared return type for odd payloads
+        if err:
+            return str(err)
+    return response.text[:500] or "<empty response body>"
+
+
+class GeminiImageGenerationClient:
+    """Async client for Imagen and Gemini image models on the native Generative Language API."""
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None,
+        api_base: str | None = None,
+        extra_headers: dict[str, str] | None = None,
+        extra_body: dict[str, Any] | None = None,
+        timeout: float = _GEMINI_DEFAULT_TIMEOUT_S,
+        client: httpx.AsyncClient | None = None,
+    ) -> None:
+        self.api_key = api_key
+        # The Gemini provider's registry default_api_base is the OpenAI-compat
+        # shim (.../v1beta/openai/), which has no image endpoints. Image
+        # generation needs the native Generative Language API base, so we don't
+        # use _provider_base_url() here.
+        self.api_base = (
+            api_base or "https://generativelanguage.googleapis.com/v1beta"
+        ).rstrip("/")
+        self.extra_headers = extra_headers or {}
+        self.extra_body = extra_body or {}
+        self.timeout = timeout  # only applied when no ``client`` is injected
+        self._client = client
+
+    async def generate(
+        self,
+        *,
+        prompt: str,
+        model: str,
+        reference_images: list[str] | None = None,
+        aspect_ratio: str | None = None,
+        image_size: str | None = None,
+    ) -> GeneratedImageResponse:
+        """Generate images via ``:predict`` when ``model`` contains "imagen", else via ``:generateContent``.
+
+        ``image_size`` is unused; ``aspect_ratio`` reaches Imagen only, ``reference_images`` non-Imagen only.
+        """
+        if not self.api_key:
+            raise ImageGenerationError("Gemini API key is not configured. Set providers.gemini.apiKey.")
+        if "imagen" in model.lower():
+            if reference_images:
+                logger.warning(
+                    "Imagen models do not support reference images; "
+                    "ignoring {} reference image(s) for {}",
+                    len(reference_images),
+                    model,
+                )
+            return await self._generate_imagen(prompt=prompt, model=model, aspect_ratio=aspect_ratio)
+        return await self._generate_gemini_flash(
+            prompt=prompt, model=model, reference_images=reference_images or []
+        )
+
+    async def _generate_imagen(
+        self,
+        *,
+        prompt: str,
+        model: str,
+        aspect_ratio: str | None,
+    ) -> GeneratedImageResponse:
+        """POST to ``:predict`` and return each prediction's base64 image as a data URL."""
+        parameters: dict[str, Any] = {"sampleCount": 1}
+        if aspect_ratio in _GEMINI_IMAGEN_ASPECT_RATIOS:
+            parameters["aspectRatio"] = aspect_ratio
+        body: dict[str, Any] = {"instances": [{"prompt": prompt}], "parameters": parameters}
+        body.update(self.extra_body)  # shallow merge: caller-supplied top-level keys win
+
+        url = f"{self.api_base}/models/{model}:predict"
+        headers = {
+            "x-goog-api-key": self.api_key or "",
+            "Content-Type": "application/json",
+            **self.extra_headers,
+        }
+
+        if self._client is not None:
+            response = await self._client.post(url, headers=headers, json=body)
+        else:
+            async with httpx.AsyncClient(timeout=self.timeout) as client:
+                response = await client.post(url, headers=headers, json=body)
+
+        try:
+            response.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            detail = _http_error_detail(response)
+            logger.error("Gemini Imagen generation failed (HTTP {}): {}", response.status_code, detail)
+            raise ImageGenerationError(
+                f"Gemini Imagen generation failed (HTTP {response.status_code}): {detail}"
+            ) from exc
+
+        data = response.json()
+        if not isinstance(data, dict):  # non-object JSON body: handled as "no images" below
+            data = {}
+        images: list[str] = []
+        for prediction in data.get("predictions") or []:
+            if not isinstance(prediction, dict):
+                continue
+            b64 = prediction.get("bytesBase64Encoded")
+            mime = prediction.get("mimeType") or "image/png"  # also covers an explicit null/empty value
+            if isinstance(b64, str) and b64:
+                images.append(f"data:{mime};base64,{b64}")
+
+        if not images:
+            provider_error = data.get("error")
+            if provider_error:
+                raise ImageGenerationError(f"Gemini Imagen returned no images: {provider_error}")
+            raise ImageGenerationError("Gemini Imagen returned no images for this request")
+
+        return GeneratedImageResponse(images=images, content="", raw=data)
+
+    async def _generate_gemini_flash(
+        self,
+        *,
+        prompt: str,
+        model: str,
+        reference_images: list[str],
+    ) -> GeneratedImageResponse:
+        """POST to ``:generateContent`` (reference images first, then the prompt); collect image and text parts."""
+        parts: list[dict[str, Any]] = [{"inlineData": image_path_to_inline_data(p)} for p in reference_images]
+        parts.append({"text": prompt})
+
+        body: dict[str, Any] = {
+            "contents": [{"role": "user", "parts": parts}],
+            "generationConfig": {"responseModalities": ["TEXT", "IMAGE"]},
+        }
+        body.update(self.extra_body)  # shallow merge: caller-supplied top-level keys win
+
+        url = f"{self.api_base}/models/{model}:generateContent"
+        headers = {
+            "x-goog-api-key": self.api_key or "",
+            "Content-Type": "application/json",
+            **self.extra_headers,
+        }
+
+        if self._client is not None:
+            response = await self._client.post(url, headers=headers, json=body)
+        else:
+            async with httpx.AsyncClient(timeout=self.timeout) as client:
+                response = await client.post(url, headers=headers, json=body)
+
+        try:
+            response.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            detail = _http_error_detail(response)
+            logger.error("Gemini image generation failed (HTTP {}): {}", response.status_code, detail)
+            raise ImageGenerationError(
+                f"Gemini image generation failed (HTTP {response.status_code}): {detail}"
+            ) from exc
+
+        data = response.json()
+        if not isinstance(data, dict):  # non-object JSON body: handled as "no images" below
+            data = {}
+        images: list[str] = []
+        text_parts: list[str] = []
+        for candidate in data.get("candidates") or []:
+            content = candidate.get("content") if isinstance(candidate, dict) else None
+            for part in (content.get("parts") if isinstance(content, dict) else None) or []:
+                if not isinstance(part, dict):
+                    continue
+                text = part.get("text")
+                if isinstance(text, str):  # skip null/non-string text so the join below cannot fail
+                    text_parts.append(text)
+                inline = part.get("inlineData")
+                if isinstance(inline, dict):
+                    mime = inline.get("mimeType") or "image/png"
+                    b64 = inline.get("data")
+                    if isinstance(b64, str) and b64:
+                        images.append(f"data:{mime};base64,{b64}")
+
+        if not images:
+            provider_error = data.get("error")
+            if provider_error:
+                raise ImageGenerationError(f"Gemini returned no images: {provider_error}")
+            raise ImageGenerationError("Gemini returned no images for this request")
+
+        return GeneratedImageResponse(
+            images=images,
+            content="\n".join(t for t in text_parts if t).strip(),
+            raw=data,
+        )
+
+
 async def _aihubmix_images_from_payload(
     client: httpx.AsyncClient,
     payload: dict[str, Any],
diff --git a/nanobot/skills/image-generation/SKILL.md b/nanobot/skills/image-generation/SKILL.md
index 3ba0e2f45..f0309e68b 100644
--- a/nanobot/skills/image-generation/SKILL.md
+++ b/nanobot/skills/image-generation/SKILL.md
@@ -88,6 +88,27 @@ AIHubMix `gpt-image-2-free` uses AIHubMix's unified predictions endpoint interna
 
 `providers.aihubmix.extraBody` can be used for provider-specific options. For example, `"extraBody": {"quality": "low"}` is optional but can make `gpt-image-2-free` faster and less likely to time out.
 
+For Gemini, the image tool supports two model families. Imagen 4 (`imagen-4.0-generate-001`) supports text-to-image only. Gemini Flash (`gemini-2.5-flash-image`) also supports reference-image edits. Configuration:
+
+```json
+{
+  "providers": {
+    "gemini": {
+      "apiKey": "AIza..."
+    }
+  },
+  "tools": {
+    "imageGeneration": {
+      "enabled": true,
+      "provider": "gemini",
+      "model": "imagen-4.0-generate-001"
+    }
+  }
+}
+```
+
+For Gemini models, `defaultImageSize` has no effect. `defaultAspectRatio` is applied to Imagen models only (supported ratios: `1:1`, `9:16`, `16:9`, `3:4`, and `4:3`); it is not forwarded to Gemini Flash image models.
+
 ## Examples
 
 Generate a new image:
diff --git a/tests/providers/test_image_generation.py b/tests/providers/test_image_generation.py
index 8f2801d68..bea317d22 100644
--- a/tests/providers/test_image_generation.py
+++ b/tests/providers/test_image_generation.py
@@ -8,6 +8,7 @@ import pytest
 
 from nanobot.providers.image_generation import (
     AIHubMixImageGenerationClient,
+    GeminiImageGenerationClient,
     GeneratedImageResponse,
     ImageGenerationError,
     OpenRouterImageGenerationClient,
@@ -202,3 +203,137 @@ async def test_aihubmix_image_generation_downloads_url_response() -> None:
 
     assert response.images[0].startswith("data:image/png;base64,")
     assert fake.get_calls[0]["url"] == "https://cdn.example/image.png"
+
+
+RAW_B64 = PNG_DATA_URL.removeprefix("data:image/png;base64,")  # bare base64 payload without the data-URL prefix
+
+
+@pytest.mark.asyncio
+async def test_gemini_imagen_payload_and_response() -> None:  # Imagen path: request shape and response decoding
+    fake = FakeClient(
+        FakeResponse({"predictions": [{"bytesBase64Encoded": RAW_B64, "mimeType": "image/png"}]})
+    )
+    client = GeminiImageGenerationClient(
+        api_key="AIza-test",
+        api_base="https://generativelanguage.googleapis.com/v1beta",
+        client=fake,  # type: ignore[arg-type]
+    )
+
+    response = await client.generate(
+        prompt="a sunset",
+        model="imagen-4.0-generate-001",
+        aspect_ratio="16:9",
+    )
+
+    assert response.images == [PNG_DATA_URL]
+    assert response.content == ""
+    call = fake.calls[0]
+    assert call["url"].endswith(":predict")
+    assert call["headers"]["x-goog-api-key"] == "AIza-test"
+    assert "params" not in call  # expect no query params; the API key is asserted in the header above
+    body = call["json"]
+    assert body["instances"] == [{"prompt": "a sunset"}]
+    assert body["parameters"]["sampleCount"] == 1
+    assert body["parameters"]["aspectRatio"] == "16:9"
+
+
+@pytest.mark.asyncio
+async def test_gemini_imagen_ignores_unsupported_aspect_ratio() -> None:
+    fake = FakeClient(
+        FakeResponse({"predictions": [{"bytesBase64Encoded": RAW_B64, "mimeType": "image/png"}]})
+    )
+    client = GeminiImageGenerationClient(api_key="AIza-test", client=fake)  # type: ignore[arg-type]
+
+    await client.generate(prompt="a sunset", model="imagen-4.0-generate-001", aspect_ratio="2:3")  # "2:3" is not an accepted Imagen ratio
+
+    body = fake.calls[0]["json"]
+    assert "aspectRatio" not in body["parameters"]  # unsupported ratio is dropped, not forwarded
+
+
+@pytest.mark.asyncio
+async def test_gemini_flash_payload_and_response() -> None:  # non-Imagen path: request shape and response decoding
+    fake = FakeClient(
+        FakeResponse(
+            {
+                "candidates": [
+                    {
+                        "content": {
+                            "parts": [
+                                {"text": "here is your image"},
+                                {"inlineData": {"mimeType": "image/png", "data": RAW_B64}},
+                            ]
+                        }
+                    }
+                ]
+            }
+        )
+    )
+    client = GeminiImageGenerationClient(
+        api_key="AIza-test",
+        api_base="https://generativelanguage.googleapis.com/v1beta",
+        client=fake,  # type: ignore[arg-type]
+    )
+
+    response = await client.generate(
+        prompt="draw a cat",
+        model="gemini-2.0-flash-preview-image-generation",
+    )
+
+    assert response.images == [PNG_DATA_URL]
+    assert response.content == "here is your image"
+    call = fake.calls[0]
+    assert call["url"].endswith(":generateContent")
+    assert call["headers"]["x-goog-api-key"] == "AIza-test"
+    assert "params" not in call  # expect no query params; the API key is asserted in the header above
+    body = call["json"]
+    assert body["generationConfig"]["responseModalities"] == ["TEXT", "IMAGE"]
+    assert body["contents"][0]["parts"][-1] == {"text": "draw a cat"}  # the prompt is the final part
+
+
+@pytest.mark.asyncio
+async def test_gemini_flash_reference_images(tmp_path: Path) -> None:  # reference images become inlineData parts
+    ref = tmp_path / "ref.png"
+    ref.write_bytes(PNG_BYTES)
+    fake = FakeClient(
+        FakeResponse(
+            {
+                "candidates": [
+                    {
+                        "content": {
+                            "parts": [{"inlineData": {"mimeType": "image/png", "data": RAW_B64}}]
+                        }
+                    }
+                ]
+            }
+        )
+    )
+    client = GeminiImageGenerationClient(api_key="AIza-test", client=fake)  # type: ignore[arg-type]
+
+    response = await client.generate(
+        prompt="edit this",
+        model="gemini-2.0-flash-preview-image-generation",
+        reference_images=[str(ref)],
+    )
+
+    assert response.images == [PNG_DATA_URL]
+    parts = fake.calls[0]["json"]["contents"][0]["parts"]
+    assert parts[0]["inlineData"]["mimeType"] == "image/png"
+    assert parts[0]["inlineData"]["data"].startswith("iVBOR")  # base64 of the PNG file signature
+    assert parts[1] == {"text": "edit this"}  # the prompt follows the reference image
+
+
+@pytest.mark.asyncio
+async def test_gemini_requires_api_key() -> None:
+    client = GeminiImageGenerationClient(api_key=None)  # no fake client: the key check must fail before any request
+
+    with pytest.raises(ImageGenerationError, match="API key"):
+        await client.generate(prompt="draw", model="imagen-4.0-generate-001")
+
+
+@pytest.mark.asyncio
+async def test_gemini_no_images_raises() -> None:
+    fake = FakeClient(FakeResponse({"candidates": [{"content": {"parts": [{"text": "sorry"}]}}]}))  # text only, no inlineData
+    client = GeminiImageGenerationClient(api_key="AIza-test", client=fake)  # type: ignore[arg-type]
+
+    with pytest.raises(ImageGenerationError, match="returned no images"):
+        await client.generate(prompt="draw", model="gemini-2.0-flash-preview-image-generation")