fix: keep Telegram streamed code blocks balanced

Maintainer edit: split final streamed Telegram markdown before rendering to HTML so long fenced code blocks do not produce unbalanced <pre><code> chunks while still respecting Telegram's rendered HTML limit.
This commit is contained in:
chengyongru 2026-06-09 17:57:48 +08:00 committed by Xubin Ren
parent a5a816abaf
commit ffae1dca6d
2 changed files with 62 additions and 11 deletions

View File

@ -36,9 +36,9 @@ from nanobot.utils.helpers import split_message
TELEGRAM_MAX_MESSAGE_LEN = 4000 # Telegram message character limit TELEGRAM_MAX_MESSAGE_LEN = 4000 # Telegram message character limit
# Telegram's actual API limit is 4096; we split raw markdown at 4000 as a # Telegram's actual API limit is 4096; we split raw markdown at 4000 as a
# safety margin for mid-stream edits (plain text). For _stream_end, we # safety margin for mid-stream edits (plain text). For _stream_end, we split
# convert to HTML first and then split at the true 4096-char boundary so # raw markdown into chunks whose rendered HTML fits Telegram's true 4096-char
# the final rendered message never overflows. # boundary so the final rendered message never overflows.
TELEGRAM_HTML_MAX_LEN = 4096 TELEGRAM_HTML_MAX_LEN = 4096
TELEGRAM_REPLY_CONTEXT_MAX_LEN = TELEGRAM_MAX_MESSAGE_LEN # Max length for reply context in user message TELEGRAM_REPLY_CONTEXT_MAX_LEN = TELEGRAM_MAX_MESSAGE_LEN # Max length for reply context in user message
@ -285,6 +285,32 @@ def _markdown_to_telegram_html(text: str) -> str:
return text return text
def _split_telegram_markdown_html(content: str, max_html_len: int) -> list[str]:
    """Render raw Telegram Markdown into HTML chunks of at most ``max_html_len``.

    The markdown is first split at ``TELEGRAM_MAX_MESSAGE_LEN`` and every piece
    is rendered separately. A piece whose rendered HTML is still too long is
    split again *as markdown* with a proportionally smaller budget. Only when
    the markdown cannot be split any further is the rendered HTML itself cut
    with ``split_message``.

    Args:
        content: Raw markdown text to send.
        max_html_len: Maximum length allowed for each rendered HTML chunk.

    Returns:
        The rendered HTML chunks, in message order.
    """
    rendered: list[str] = []
    # Work stack stored in reverse order so the next piece to process sits at
    # the end of the list.
    stack = list(reversed(_split_telegram_markdown(content, TELEGRAM_MAX_MESSAGE_LEN)))
    while stack:
        piece = stack.pop()
        piece_html = _markdown_to_telegram_html(piece)
        if len(piece_html) <= max_html_len:
            rendered.append(piece_html)
            continue

        # Rendering adds tags/entities, so the HTML can outgrow the markdown.
        # Scale the markdown budget by the observed expansion ratio; the "- 8"
        # is presumably a small safety margin (NOTE(review): confirm intent).
        scaled = int(len(piece) * max_html_len / len(piece_html)) - 8
        budget = min(max(1, scaled), len(piece) - 1)

        sub_pieces = None
        if budget > 0:
            sub_pieces = _split_telegram_markdown(piece, budget)
            if len(sub_pieces) == 1 and sub_pieces[0] == piece:
                # The markdown splitter could not make the piece any smaller.
                sub_pieces = None

        if sub_pieces is None:
            # Last resort: cut the already-rendered HTML directly.
            rendered.extend(split_message(piece_html, max_html_len))
        else:
            # Put the smaller pieces back at the front of the queue, in order.
            stack.extend(reversed(sub_pieces))
    return rendered
_SEND_MAX_RETRIES = 3 _SEND_MAX_RETRIES = 3
_SEND_RETRY_BASE_DELAY = 0.5 # seconds, doubled each retry _SEND_RETRY_BASE_DELAY = 0.5 # seconds, doubled each retry
_STREAM_EDIT_INTERVAL_DEFAULT = 0.6 # min seconds between edit_message_text calls _STREAM_EDIT_INTERVAL_DEFAULT = 0.6 # min seconds between edit_message_text calls
@ -800,14 +826,9 @@ class TelegramChannel(BaseChannel):
if message_thread_id := meta.get("message_thread_id"): if message_thread_id := meta.get("message_thread_id"):
thread_kwargs["message_thread_id"] = message_thread_id thread_kwargs["message_thread_id"] = message_thread_id
raw_text = buf.text raw_text = buf.text
html = _markdown_to_telegram_html(raw_text) html_chunks = _split_telegram_markdown_html(raw_text, TELEGRAM_HTML_MAX_LEN)
if len(html) <= TELEGRAM_HTML_MAX_LEN: primary_html = html_chunks[0]
primary_html = html extra_html_chunks = html_chunks[1:]
extra_html_chunks = []
else:
html_chunks = split_message(html, TELEGRAM_HTML_MAX_LEN)
primary_html = html_chunks[0]
extra_html_chunks = html_chunks[1:]
try: try:
await self._call_with_retry( await self._call_with_retry(
self._app.bot.edit_message_text, self._app.bot.edit_message_text,

View File

@ -719,6 +719,36 @@ async def test_send_delta_stream_end_html_expansion_does_not_overflow() -> None:
assert "123" not in channel._stream_bufs assert "123" not in channel._stream_bufs
@pytest.mark.asyncio
async def test_send_delta_stream_end_splits_long_code_block_before_html_rendering() -> None:
    """Every final HTML chunk of a long fenced block keeps <pre><code> balanced.

    Streams a code block too long for a single message, ends the stream, and
    checks that each chunk passed to the bot stays within 4096 characters and
    contains as many opening ``<pre><code>`` as closing ``</code></pre>`` tags.
    """
    config = TelegramConfig(enabled=True, token="123:abc", allow_from=["*"])
    channel = TelegramChannel(config, MessageBus())
    channel._app = _FakeApp(lambda: None)
    bot = channel._app.bot
    bot.edit_message_text = AsyncMock()
    bot.send_message = AsyncMock(return_value=SimpleNamespace(message_id=99))

    code_body = "print(\"line\")\n" * 450
    raw_text = "```python\n" + code_body + "```\nDone"
    channel._stream_bufs["123"] = _StreamBuf(text=raw_text, message_id=7, last_edit=0.0)

    await channel.send_delta("123", "", {"_stream_end": True})

    # The first chunk goes out as an edit of the streamed message; any
    # remaining chunks are sent as new messages.
    edited_text = bot.edit_message_text.call_args.kwargs.get("text", "")
    sent_texts = [
        sent.kwargs.get("text", "")
        for sent in bot.send_message.call_args_list
    ]
    html_chunks = [edited_text, *sent_texts]

    assert len(html_chunks) > 1
    for chunk_html in html_chunks:
        assert len(chunk_html) <= 4096
        assert chunk_html.count("<pre><code>") == chunk_html.count("</code></pre>")
    assert "123" not in channel._stream_bufs
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_send_delta_new_stream_id_replaces_stale_buffer() -> None: async def test_send_delta_new_stream_id_replaces_stale_buffer() -> None:
channel = TelegramChannel( channel = TelegramChannel(