From a5a816abaf10b736c664a6b3bc2b282b0fc58175 Mon Sep 17 00:00:00 2001 From: axelray-dev <110029405+axelray-dev@users.noreply.github.com> Date: Tue, 9 Jun 2026 14:37:14 +0800 Subject: [PATCH] fix(telegram): move fenced-code-block splitting into Telegram-specific helper Move the fenced-code-block-aware splitting logic out of the shared split_message helper (used by Signal, Slack, Discord, Weixin, etc.) and into a Telegram-specific _split_telegram_markdown function. The shared split_message remains a plain-text chunker. The Telegram channel now uses _split_telegram_markdown for its raw Markdown paths that feed _markdown_to_telegram_html, preventing broken HTML rendering when splits fall inside fenced code blocks. Also fixes a regression where content beginning with whitespace before a fence could emit a whitespace-only chunk. Addresses review feedback on #4257. --- nanobot/channels/telegram.py | 77 ++++++++++++++++++++++++- nanobot/utils/helpers.py | 46 --------------- tests/channels/test_telegram_channel.py | 63 ++++++++++++++++++++ tests/utils/test_helpers.py | 53 ----------------- 4 files changed, 138 insertions(+), 101 deletions(-) diff --git a/nanobot/channels/telegram.py b/nanobot/channels/telegram.py index 9a9ec9bbd..9d3eafed1 100644 --- a/nanobot/channels/telegram.py +++ b/nanobot/channels/telegram.py @@ -43,6 +43,79 @@ TELEGRAM_HTML_MAX_LEN = 4096 TELEGRAM_REPLY_CONTEXT_MAX_LEN = TELEGRAM_MAX_MESSAGE_LEN # Max length for reply context in user message +def _split_telegram_markdown(content: str, max_len: int) -> list[str]: + """Split raw Telegram Markdown without leaving fenced code blocks unbalanced.""" + if not content: + return [] + content = content.lstrip() + if not content: + return [] + if len(content) <= max_len: + return [content] + + def fence_line(fence_pos: int) -> str: + line_end = content.find("\n", fence_pos) + if line_end < 0: + return content[fence_pos:] + return content[fence_pos:line_end] + + def split_inside_fenced_code_block(pos: int) -> 
tuple[bool, int, str]: + if content[:pos].count("```") % 2 == 0: + return False, -1, "" + opening = content.rfind("```", 0, pos) + if opening < 0: + return True, -1, "```" + return True, opening, fence_line(opening) + + chunks: list[str] = [] + while content: + if len(content) <= max_len: + chunks.append(content) + break + + cut = content[:max_len] + pos = cut.rfind("\n") + if pos <= 0: + pos = cut.rfind(" ") + if pos <= 0: + pos = max_len + + inside_code, opening, fence = split_inside_fenced_code_block(pos) + if inside_code: + if opening > 0: + pos = opening + else: + closing = "\n```" + min_code_pos = len(fence) + if content.startswith(fence + "\n"): + min_code_pos += 1 + if pos < min_code_pos and min_code_pos + len(closing) > max_len: + chunks.append(content[:max_len]) + content = content[max_len:].lstrip() + continue + if pos + len(closing) > max_len: + budget = max_len - len(closing) + if budget > 0: + recut = content[:budget] + adjusted = recut.rfind("\n") + if adjusted <= 0: + adjusted = recut.rfind(" ") + pos = adjusted if adjusted > 0 else budget + else: + closing = "```" + pos = max_len - len(closing) + chunks.append(content[:pos] + closing) + remainder = content[pos:] + if remainder.startswith("\n"): + remainder = remainder[1:] + content = f"{fence}\n{remainder}" + continue + + chunks.append(content[:pos]) + content = content[pos:].lstrip() + return chunks + + def _escape_telegram_html(text: str) -> str: """Escape text for Telegram HTML parse mode.""" return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;") @@ -632,7 +705,7 @@ class TelegramChannel(BaseChannel): # Fallback: no native keyboard → splice labels into the message so the choices survive. 
if buttons and reply_markup is None: text = f"{text}\n\n{self._buttons_as_text(buttons)}" - chunks = split_message(text, TELEGRAM_MAX_MESSAGE_LEN) + chunks = _split_telegram_markdown(text, TELEGRAM_MAX_MESSAGE_LEN) for i, chunk in enumerate(chunks): is_last = (i == len(chunks) - 1) await self._send_text( @@ -838,7 +911,7 @@ class TelegramChannel(BaseChannel): intermediate chunks as standalone messages, then opens a new message for the tail so subsequent deltas continue streaming into it. """ - chunks = split_message(buf.text, TELEGRAM_MAX_MESSAGE_LEN) + chunks = _split_telegram_markdown(buf.text, TELEGRAM_MAX_MESSAGE_LEN) if len(chunks) <= 1: return try: diff --git a/nanobot/utils/helpers.py b/nanobot/utils/helpers.py index 181cea9ca..6341bc2bc 100644 --- a/nanobot/utils/helpers.py +++ b/nanobot/utils/helpers.py @@ -368,22 +368,6 @@ def maybe_persist_tool_result( ) -def _fence_line(content: str, fence_pos: int) -> str: - line_end = content.find("\n", fence_pos) - if line_end < 0: - return content[fence_pos:] - return content[fence_pos:line_end] - - -def _split_inside_fenced_code_block(content: str, pos: int) -> tuple[bool, int, str]: - if content[:pos].count("```") % 2 == 0: - return False, -1, "" - opening = content.rfind("```", 0, pos) - if opening < 0: - return True, -1, "```" - return True, opening, _fence_line(content, opening) - - def split_message(content: str, max_len: int = 2000) -> list[str]: """ Split content into chunks within max_len, preferring line breaks. 
@@ -411,36 +395,6 @@ def split_message(content: str, max_len: int = 2000) -> list[str]: pos = cut.rfind(" ") if pos <= 0: pos = max_len - inside_code, opening, fence = _split_inside_fenced_code_block(content, pos) - if inside_code: - if opening > 0: - pos = opening - else: - closing = "\n```" - min_code_pos = len(fence) - if content.startswith(fence + "\n"): - min_code_pos += 1 - if pos < min_code_pos and min_code_pos + len(closing) > max_len: - chunks.append(content[:max_len]) - content = content[max_len:].lstrip() - continue - if pos + len(closing) > max_len: - budget = max_len - len(closing) - if budget > 0: - recut = content[:budget] - adjusted = recut.rfind("\n") - if adjusted <= 0: - adjusted = recut.rfind(" ") - pos = adjusted if adjusted > 0 else budget - else: - closing = "```" - pos = max_len - len(closing) - chunks.append(content[:pos] + closing) - remainder = content[pos:] - if remainder.startswith("\n"): - remainder = remainder[1:] - content = f"{fence}\n{remainder}" - continue chunks.append(content[:pos]) content = content[pos:].lstrip() return chunks diff --git a/tests/channels/test_telegram_channel.py b/tests/channels/test_telegram_channel.py index 9b66d58be..5115791d9 100644 --- a/tests/channels/test_telegram_channel.py +++ b/tests/channels/test_telegram_channel.py @@ -17,6 +17,8 @@ from nanobot.channels.telegram import ( TELEGRAM_REPLY_CONTEXT_MAX_LEN, TelegramChannel, TelegramConfig, + _markdown_to_telegram_html, + _split_telegram_markdown, _StreamBuf, ) @@ -179,6 +181,67 @@ def _make_telegram_update( return SimpleNamespace(message=message, effective_user=user) +def _assert_code_blocks_render_balanced(chunks: list[str]) -> None: + for chunk in chunks: + html = _markdown_to_telegram_html(chunk) + assert html.count("<pre>") == html.count("</pre>") + + +def test_split_telegram_markdown_inside_code_block_moves_before_fence() -> None: + content = "Intro paragraph.\n```python\nprint('a')\nprint('b')\n```\nDone" + + chunks = _split_telegram_markdown(content, max_len=35) + + assert chunks[0] == "Intro paragraph.\n" + assert chunks[1].startswith("```python\nprint('a')") + _assert_code_blocks_render_balanced(chunks) + + +def test_split_telegram_markdown_long_code_block_closes_and_reopens() -> None: + content = "```python\n" + ("print('line one')\n" * 6) + "```\nDone" + + chunks = _split_telegram_markdown(content, max_len=60) + + assert len(chunks) > 1 + assert all(len(chunk) <= 60 for chunk in chunks) + assert chunks[0].startswith("```python\n") + assert chunks[0].endswith("\n```") + assert chunks[1].startswith("```python\n") + _assert_code_blocks_render_balanced(chunks) + + +def test_split_telegram_markdown_multiple_code_blocks() -> None: + content = ( + "First\n" + "```js\n" + "one();\n" + "```\n" + "Middle paragraph here\n" + "```py\n" + "two()\n" + "three()\n" + "```\n" + "End" + ) + + chunks = _split_telegram_markdown(content, max_len=55) + + assert chunks[0].endswith("Middle paragraph here\n") + assert chunks[1].startswith("```py\n") + _assert_code_blocks_render_balanced(chunks) + + +def test_split_telegram_markdown_leading_whitespace_before_fence() -> None: + content = "\n```python\n" + ("print('line one')\n" * 6) + "```\nDone" + + chunks = _split_telegram_markdown(content, max_len=60) + + assert chunks + assert all(chunk.strip() for chunk in chunks) + assert chunks[0].startswith("```python\n") + _assert_code_blocks_render_balanced(chunks) + + @pytest.mark.asyncio async def test_start_creates_separate_pools_with_proxy(monkeypatch) -> None: _FakeHTTPXRequest.clear() diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py index 1823c9b34..9dd133d84 100644 --- a/tests/utils/test_helpers.py +++ b/tests/utils/test_helpers.py @@ -5,56 +5,3 @@ def test_split_message_no_code_blocks_unchanged(): 
content = "alpha beta gamma delta" assert split_message(content, max_len=12) == ["alpha beta", "gamma delta"] - - -def test_split_message_outside_code_block_unchanged(): - content = "alpha beta gamma delta\n```python\nx = 1\n```\ndone" - - chunks = split_message(content, max_len=12) - - assert chunks[0] == "alpha beta" - assert chunks[1].startswith("gamma") - - -def test_split_message_inside_code_block_moves_before_fence(): - content = "Intro paragraph.\n```python\nprint('a')\nprint('b')\n```\nDone" - - chunks = split_message(content, max_len=35) - - assert chunks[0] == "Intro paragraph.\n" - assert chunks[1].startswith("```python\nprint('a')") - assert all(chunk.count("```") % 2 == 0 for chunk in chunks[1:]) - - -def test_split_message_code_block_longer_than_max_len_closes_and_reopens(): - content = "```python\n" + ("print('line one')\n" * 6) + "```\nDone" - - chunks = split_message(content, max_len=60) - - assert len(chunks) > 1 - assert all(len(chunk) <= 60 for chunk in chunks) - assert all(chunk.count("```") % 2 == 0 for chunk in chunks) - assert chunks[0].startswith("```python\n") - assert chunks[0].endswith("\n```") - assert chunks[1].startswith("```python\n") - - -def test_split_message_multiple_code_blocks_moves_second_block_to_next_chunk(): - content = ( - "First\n" - "```js\n" - "one();\n" - "```\n" - "Middle paragraph here\n" - "```py\n" - "two()\n" - "three()\n" - "```\n" - "End" - ) - - chunks = split_message(content, max_len=55) - - assert chunks[0].endswith("Middle paragraph here\n") - assert chunks[1].startswith("```py\n") - assert all(chunk.count("```") % 2 == 0 for chunk in chunks)