From a5a816abaf10b736c664a6b3bc2b282b0fc58175 Mon Sep 17 00:00:00 2001
From: axelray-dev <110029405+axelray-dev@users.noreply.github.com>
Date: Tue, 9 Jun 2026 14:37:14 +0800
Subject: [PATCH] fix(telegram): move fenced-code-block splitting into
 Telegram-specific helper

Move the fenced-code-block-aware splitting logic out of the shared
split_message helper (used by Signal, Slack, Discord, Weixin, etc.)
and into a Telegram-specific _split_telegram_markdown function.

The shared split_message remains a plain-text chunker. The Telegram
channel now uses _split_telegram_markdown for its raw Markdown paths
that feed _markdown_to_telegram_html, preventing broken HTML rendering
when splits fall inside fenced code blocks.

Also fixes a regression where content beginning with whitespace before
a fence could emit a whitespace-only chunk.

Addresses review feedback on #4257.
---
 nanobot/channels/telegram.py            | 77 ++++++++++++++++++++++++-
 nanobot/utils/helpers.py                | 46 ---------------
 tests/channels/test_telegram_channel.py | 63 ++++++++++++++++++++
 tests/utils/test_helpers.py             | 53 -----------------
 4 files changed, 138 insertions(+), 101 deletions(-)

diff --git a/nanobot/channels/telegram.py b/nanobot/channels/telegram.py
index 9a9ec9bbd..9d3eafed1 100644
--- a/nanobot/channels/telegram.py
+++ b/nanobot/channels/telegram.py
@@ -43,6 +43,79 @@ TELEGRAM_HTML_MAX_LEN = 4096
 TELEGRAM_REPLY_CONTEXT_MAX_LEN = TELEGRAM_MAX_MESSAGE_LEN  # Max length for reply context in user message
 
 
+def _split_telegram_markdown(content: str, max_len: int) -> list[str]:
+    """Split raw Telegram Markdown without leaving fenced code blocks unbalanced.
+
+    Breaks prefer the last newline, then the last space, then a hard cut at
+    ``max_len``. A break inside a fenced block is moved back to just before the
+    opening fence; when the block itself starts the chunk, the chunk is closed
+    with a fence and the next chunk reopens it with the same fence line.
+
+    Args:
+        content: Raw Markdown text; leading whitespace is dropped.
+        max_len: Maximum chunk length in characters; must be positive.
+
+    Returns:
+        Chunks of at most ``max_len`` characters (``[]`` for blank input). A
+        fence line too long to share a chunk with its closing fence is
+        hard-split and left unbalanced.
+
+    Raises:
+        ValueError: If non-blank content is given with ``max_len`` below 1.
+    """
+    content = content.lstrip() if content else ""
+    if not content:
+        return []
+    if max_len < 1:
+        # A non-positive limit can never make progress; fail instead of looping.
+        raise ValueError("max_len must be positive")
+    closing = "\n```"
+    budget = max_len - len(closing)  # text that still fits once a fence is closed
+    chunks: list[str] = []
+    while len(content) > max_len:
+        # Preferred break: last newline, else last space, else a hard cut.
+        cut = content[:max_len]
+        pos = cut.rfind("\n")
+        if pos <= 0:
+            pos = cut.rfind(" ")
+        if pos <= 0:
+            pos = max_len
+
+        # An odd number of fences before the break means it sits inside a block.
+        in_fence = content.count("```", 0, pos) % 2 == 1
+        opening = content.rfind("```", 0, pos) if in_fence else -1
+        header_len = content.find("\n") + 1  # first line plus newline; 0 if none
+        if opening > 0:
+            pos = opening  # break before the fence so the block stays whole
+        elif opening == 0 and 0 < header_len < budget:
+            if pos > budget:
+                # No room left for the closing fence: retry within the budget.
+                pos = content.rfind("\n", 0, budget)
+            if pos <= header_len:
+                # No newline beyond the start of the code: cutting there would
+                # re-queue the same content forever, so cut inside the code
+                # instead, at a space when one is available.
+                pos = content.rfind(" ", header_len + 1, budget + 1)
+                if pos < 0:
+                    pos = budget
+            chunks.append(content[:pos] + closing)
+            rest = content[pos:]
+            if rest.startswith("\n"):
+                rest = rest[1:]
+            content = content[:header_len] + rest  # reopen with the same fence
+            continue
+        elif opening == 0:
+            # The fence line (nearly) fills a chunk by itself: fall back to a
+            # plain hard cut, which leaves this fence unbalanced.
+            pos = max_len
+
+        chunks.append(content[:pos])
+        content = content[pos:].lstrip()
+    if content:
+        chunks.append(content)
+    return chunks
+
+
 def _escape_telegram_html(text: str) -> str:
     """Escape text for Telegram HTML parse mode."""
     return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
@@ -632,7 +705,7 @@ class TelegramChannel(BaseChannel):
             # Fallback: no native keyboard → splice labels into the message so the choices survive.
             if buttons and reply_markup is None:
                 text = f"{text}\n\n{self._buttons_as_text(buttons)}"
-            chunks = split_message(text, TELEGRAM_MAX_MESSAGE_LEN)
+            chunks = _split_telegram_markdown(text, TELEGRAM_MAX_MESSAGE_LEN)
             for i, chunk in enumerate(chunks):
                 is_last = (i == len(chunks) - 1)
                 await self._send_text(
@@ -838,7 +911,7 @@ class TelegramChannel(BaseChannel):
         intermediate chunks as standalone messages, then opens a new message
         for the tail so subsequent deltas continue streaming into it.
         """
-        chunks = split_message(buf.text, TELEGRAM_MAX_MESSAGE_LEN)
+        chunks = _split_telegram_markdown(buf.text, TELEGRAM_MAX_MESSAGE_LEN)
         if len(chunks) <= 1:
             return
         try:
diff --git a/nanobot/utils/helpers.py b/nanobot/utils/helpers.py
index 181cea9ca..6341bc2bc 100644
--- a/nanobot/utils/helpers.py
+++ b/nanobot/utils/helpers.py
@@ -368,22 +368,6 @@ def maybe_persist_tool_result(
     )
 
 
-def _fence_line(content: str, fence_pos: int) -> str:
-    line_end = content.find("\n", fence_pos)
-    if line_end < 0:
-        return content[fence_pos:]
-    return content[fence_pos:line_end]
-
-
-def _split_inside_fenced_code_block(content: str, pos: int) -> tuple[bool, int, str]:
-    if content[:pos].count("```") % 2 == 0:
-        return False, -1, ""
-    opening = content.rfind("```", 0, pos)
-    if opening < 0:
-        return True, -1, "```"
-    return True, opening, _fence_line(content, opening)
-
-
 def split_message(content: str, max_len: int = 2000) -> list[str]:
     """
     Split content into chunks within max_len, preferring line breaks.
@@ -411,36 +395,6 @@ def split_message(content: str, max_len: int = 2000) -> list[str]:
             pos = cut.rfind(" ")
         if pos <= 0:
             pos = max_len
-        inside_code, opening, fence = _split_inside_fenced_code_block(content, pos)
-        if inside_code:
-            if opening > 0:
-                pos = opening
-            else:
-                closing = "\n```"
-                min_code_pos = len(fence)
-                if content.startswith(fence + "\n"):
-                    min_code_pos += 1
-                if pos < min_code_pos and min_code_pos + len(closing) > max_len:
-                    chunks.append(content[:max_len])
-                    content = content[max_len:].lstrip()
-                    continue
-                if pos + len(closing) > max_len:
-                    budget = max_len - len(closing)
-                    if budget > 0:
-                        recut = content[:budget]
-                        adjusted = recut.rfind("\n")
-                        if adjusted <= 0:
-                            adjusted = recut.rfind(" ")
-                        pos = adjusted if adjusted > 0 else budget
-                    else:
-                        closing = "```"
-                        pos = max_len - len(closing)
-                chunks.append(content[:pos] + closing)
-                remainder = content[pos:]
-                if remainder.startswith("\n"):
-                    remainder = remainder[1:]
-                content = f"{fence}\n{remainder}"
-                continue
         chunks.append(content[:pos])
         content = content[pos:].lstrip()
     return chunks
diff --git a/tests/channels/test_telegram_channel.py b/tests/channels/test_telegram_channel.py
index 9b66d58be..5115791d9 100644
--- a/tests/channels/test_telegram_channel.py
+++ b/tests/channels/test_telegram_channel.py
@@ -17,6 +17,8 @@ from nanobot.channels.telegram import (
     TELEGRAM_REPLY_CONTEXT_MAX_LEN,
     TelegramChannel,
     TelegramConfig,
+    _markdown_to_telegram_html,
+    _split_telegram_markdown,
     _StreamBuf,
 )
 
@@ -179,6 +181,67 @@ def _make_telegram_update(
     return SimpleNamespace(message=message, effective_user=user)
 
 
+def _assert_code_blocks_render_balanced(chunks: list[str]) -> None:
+    """Assert every rendered chunk has equal ``<pre><code>`` open/close counts."""
+    for rendered in map(_markdown_to_telegram_html, chunks):
+        assert rendered.count("<pre><code>") == rendered.count("</code></pre>")
+
+
+def test_split_telegram_markdown_inside_code_block_moves_before_fence() -> None:
+    content = "Intro paragraph.\n```python\nprint('a')\nprint('b')\n```\nDone"
+    # The 35-char limit falls inside the fenced block, past its opening fence.
+    chunks = _split_telegram_markdown(content, max_len=35)
+    # The first chunk must stop right before the fence; the block opens the next.
+    assert chunks[0] == "Intro paragraph.\n"
+    assert chunks[1].startswith("```python\nprint('a')")
+    _assert_code_blocks_render_balanced(chunks)
+
+
+def test_split_telegram_markdown_long_code_block_closes_and_reopens() -> None:
+    content = "```python\n" + ("print('line one')\n" * 6) + "```\nDone"
+    # The fenced block alone is longer than max_len, so it cannot stay in one chunk.
+    chunks = _split_telegram_markdown(content, max_len=60)
+    # Every chunk must fit the limit; the block is closed, then reopened with its fence.
+    assert len(chunks) > 1
+    assert all(len(chunk) <= 60 for chunk in chunks)
+    assert chunks[0].startswith("```python\n")
+    assert chunks[0].endswith("\n```")
+    assert chunks[1].startswith("```python\n")
+    _assert_code_blocks_render_balanced(chunks)
+
+
+def test_split_telegram_markdown_multiple_code_blocks() -> None:
+    content = (
+        "First\n"
+        "```js\n"
+        "one();\n"
+        "```\n"
+        "Middle paragraph here\n"
+        "```py\n"
+        "two()\n"
+        "three()\n"
+        "```\n"
+        "End"
+    )
+    # The 55-char limit lands inside the second (```py) block, after a complete ```js block.
+    chunks = _split_telegram_markdown(content, max_len=55)
+    # The split must back up to before the ```py fence so that block stays whole.
+    assert chunks[0].endswith("Middle paragraph here\n")
+    assert chunks[1].startswith("```py\n")
+    _assert_code_blocks_render_balanced(chunks)
+
+
+def test_split_telegram_markdown_leading_whitespace_before_fence() -> None:
+    content = "\n```python\n" + ("print('line one')\n" * 6) + "```\nDone"
+    # A newline precedes the opening fence of a block that is longer than max_len.
+    chunks = _split_telegram_markdown(content, max_len=60)
+    # No chunk may be empty or whitespace-only, and the first must start at the fence.
+    assert chunks
+    assert all(chunk.strip() for chunk in chunks)
+    assert chunks[0].startswith("```python\n")
+    _assert_code_blocks_render_balanced(chunks)
+
+
 @pytest.mark.asyncio
 async def test_start_creates_separate_pools_with_proxy(monkeypatch) -> None:
     _FakeHTTPXRequest.clear()
diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py
index 1823c9b34..9dd133d84 100644
--- a/tests/utils/test_helpers.py
+++ b/tests/utils/test_helpers.py
@@ -5,56 +5,3 @@ def test_split_message_no_code_blocks_unchanged():
     content = "alpha beta gamma delta"
 
     assert split_message(content, max_len=12) == ["alpha beta", "gamma delta"]
-
-
-def test_split_message_outside_code_block_unchanged():
-    content = "alpha beta gamma delta\n```python\nx = 1\n```\ndone"
-
-    chunks = split_message(content, max_len=12)
-
-    assert chunks[0] == "alpha beta"
-    assert chunks[1].startswith("gamma")
-
-
-def test_split_message_inside_code_block_moves_before_fence():
-    content = "Intro paragraph.\n```python\nprint('a')\nprint('b')\n```\nDone"
-
-    chunks = split_message(content, max_len=35)
-
-    assert chunks[0] == "Intro paragraph.\n"
-    assert chunks[1].startswith("```python\nprint('a')")
-    assert all(chunk.count("```") % 2 == 0 for chunk in chunks[1:])
-
-
-def test_split_message_code_block_longer_than_max_len_closes_and_reopens():
-    content = "```python\n" + ("print('line one')\n" * 6) + "```\nDone"
-
-    chunks = split_message(content, max_len=60)
-
-    assert len(chunks) > 1
-    assert all(len(chunk) <= 60 for chunk in chunks)
-    assert all(chunk.count("```") % 2 == 0 for chunk in chunks)
-    assert chunks[0].startswith("```python\n")
-    assert chunks[0].endswith("\n```")
-    assert chunks[1].startswith("```python\n")
-
-
-def test_split_message_multiple_code_blocks_moves_second_block_to_next_chunk():
-    content = (
-        "First\n"
-        "```js\n"
-        "one();\n"
-        "```\n"
-        "Middle paragraph here\n"
-        "```py\n"
-        "two()\n"
-        "three()\n"
-        "```\n"
-        "End"
-    )
-
-    chunks = split_message(content, max_len=55)
-
-    assert chunks[0].endswith("Middle paragraph here\n")
-    assert chunks[1].startswith("```py\n")
-    assert all(chunk.count("```") % 2 == 0 for chunk in chunks)