From ef2ef4f789e29f5c75a4eeea782c5aa3d84af872 Mon Sep 17 00:00:00 2001 From: 04cb <0x04cb@gmail.com> Date: Sat, 23 May 2026 06:00:28 +0800 Subject: [PATCH] fix(transcription): normalize chat-style apiBase to audio endpoint (#3637) --- nanobot/providers/transcription.py | 35 +++++++++++++++++------ tests/providers/test_transcription.py | 40 ++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/nanobot/providers/transcription.py b/nanobot/providers/transcription.py index 9adf2e6d2..8a21d29a2 100644 --- a/nanobot/providers/transcription.py +++ b/nanobot/providers/transcription.py @@ -7,6 +7,25 @@ from pathlib import Path import httpx from loguru import logger +_TRANSCRIPTIONS_PATH = "audio/transcriptions" + + +def _resolve_transcription_url(api_base: str | None, default_url: str) -> str: + """Resolve the full transcription endpoint URL. + + Accepts either a chat-style base (e.g. ``https://api.groq.com/openai/v1``) + or a complete URL already ending in ``/audio/transcriptions``. A chat-style + base — the form users naturally copy from their LLM provider config — gets + the path appended instead of being POSTed verbatim and 404ing (#3637). + """ + if not api_base: + return default_url + base = api_base.rstrip("/") + if base.endswith(_TRANSCRIPTIONS_PATH): + return base + return f"{base}/{_TRANSCRIPTIONS_PATH}" + + # Up to 3 retries (4 attempts total) with exponential backoff on transient # failures. Whisper endpoints occasionally return 502/503 under load, and # mobile-network transcription callers hit sporadic connect/read errors. 
@@ -127,12 +146,12 @@ class OpenAITranscriptionProvider: language: str | None = None, ): self.api_key = api_key or os.environ.get("OPENAI_API_KEY") - self.api_url = ( - api_base - or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL") - or "https://api.openai.com/v1/audio/transcriptions" + self.api_url = _resolve_transcription_url( + api_base or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL"), + "https://api.openai.com/v1/audio/transcriptions", ) self.language = language or None + logger.debug("OpenAI transcription endpoint: {}", self.api_url) async def transcribe(self, file_path: str | Path) -> str: if not self.api_key: @@ -166,12 +185,12 @@ class GroqTranscriptionProvider: language: str | None = None, ): self.api_key = api_key or os.environ.get("GROQ_API_KEY") - self.api_url = ( - api_base - or os.environ.get("GROQ_BASE_URL") - or "https://api.groq.com/openai/v1/audio/transcriptions" + self.api_url = _resolve_transcription_url( + api_base or os.environ.get("GROQ_BASE_URL"), + "https://api.groq.com/openai/v1/audio/transcriptions", ) self.language = language or None + logger.debug("Groq transcription endpoint: {}", self.api_url) async def transcribe(self, file_path: str | Path) -> str: """ diff --git a/tests/providers/test_transcription.py b/tests/providers/test_transcription.py index 5fd10d552..14a784b2e 100644 --- a/tests/providers/test_transcription.py +++ b/tests/providers/test_transcription.py @@ -8,7 +8,11 @@ from unittest.mock import AsyncMock, patch import httpx import pytest -from nanobot.providers.transcription import GroqTranscriptionProvider, OpenAITranscriptionProvider +from nanobot.providers.transcription import ( + GroqTranscriptionProvider, + OpenAITranscriptionProvider, + _resolve_transcription_url, +) @pytest.fixture @@ -290,3 +294,37 @@ async def test_retries_on_every_advertised_transient_exception( result = await provider.transcribe(audio_file) assert result == "recovered" assert post.await_count == 2 + + +# 
# ---------------------------------------------------------------------------
# apiBase normalization (#3637): a chat-style base must not be POSTed verbatim
# ---------------------------------------------------------------------------


def test_resolve_transcription_url_falls_back_to_default() -> None:
    """An unset (``None``) or empty ``api_base`` yields the default endpoint."""
    fallback = "https://api.openai.com/v1/audio/transcriptions"
    for unset in (None, ""):
        assert _resolve_transcription_url(unset, fallback) == fallback


def test_resolve_transcription_url_appends_path_to_chat_style_base() -> None:
    """A v1-style base gains the audio path, with or without a trailing slash."""
    unused_default = "https://x/audio/transcriptions"
    expected = "https://api.groq.com/openai/v1/audio/transcriptions"

    resolved = _resolve_transcription_url("https://api.groq.com/openai/v1", unused_default)
    assert resolved == expected

    # Trailing slash must not produce a doubled separator.
    resolved_with_slash = _resolve_transcription_url("https://api.groq.com/openai/v1/", unused_default)
    assert resolved_with_slash == expected


def test_resolve_transcription_url_keeps_full_endpoint() -> None:
    """A URL that already ends in the audio path is returned unchanged."""
    endpoint = "https://api.groq.com/openai/v1/audio/transcriptions"
    resolved = _resolve_transcription_url(endpoint, "https://x/audio/transcriptions")
    assert resolved == endpoint


def test_groq_provider_normalizes_chat_style_api_base() -> None:
    """Regression for #3637: apiBase set to the v1 base resolves to the audio endpoint."""
    groq = GroqTranscriptionProvider(
        api_key="gsk-test",
        api_base="https://api.groq.com/openai/v1",
    )
    expected = "https://api.groq.com/openai/v1/audio/transcriptions"
    assert groq.api_url == expected