From a5a816abaf10b736c664a6b3bc2b282b0fc58175 Mon Sep 17 00:00:00 2001 From: axelray-dev <110029405+axelray-dev@users.noreply.github.com> Date: Tue, 9 Jun 2026 14:37:14 +0800 Subject: [PATCH] fix(telegram): move fenced-code-block splitting into Telegram-specific helper Move the fenced-code-block-aware splitting logic out of the shared split_message helper (used by Signal, Slack, Discord, Weixin, etc.) and into a Telegram-specific _split_telegram_markdown function. The shared split_message remains a plain-text chunker. The Telegram channel now uses _split_telegram_markdown for its raw Markdown paths that feed _markdown_to_telegram_html, preventing broken HTML rendering when splits fall inside fenced code blocks. Also fixes a regression where content beginning with whitespace before a fence could emit a whitespace-only chunk. Addresses review feedback on #4257. --- nanobot/channels/telegram.py | 77 ++++++++++++++++++++++++- nanobot/utils/helpers.py | 46 --------------- tests/channels/test_telegram_channel.py | 63 ++++++++++++++++++++ tests/utils/test_helpers.py | 53 ----------------- 4 files changed, 138 insertions(+), 101 deletions(-) diff --git a/nanobot/channels/telegram.py b/nanobot/channels/telegram.py index 9a9ec9bbd..9d3eafed1 100644 --- a/nanobot/channels/telegram.py +++ b/nanobot/channels/telegram.py @@ -43,6 +43,79 @@ TELEGRAM_HTML_MAX_LEN = 4096 TELEGRAM_REPLY_CONTEXT_MAX_LEN = TELEGRAM_MAX_MESSAGE_LEN # Max length for reply context in user message +def _split_telegram_markdown(content: str, max_len: int) -> list[str]: + """Split raw Telegram Markdown without leaving fenced code blocks unbalanced.""" + if not content: + return [] + content = content.lstrip() + if not content: + return [] + if len(content) <= max_len: + return [content] + + def fence_line(fence_pos: int) -> str: + line_end = content.find("\n", fence_pos) + if line_end < 0: + return content[fence_pos:] + return content[fence_pos:line_end] + + def split_inside_fenced_code_block(pos: int) -> 
tuple[bool, int, str]: + if content[:pos].count("```") % 2 == 0: + return False, -1, "" + opening = content.rfind("```", 0, pos) + if opening < 0: + return True, -1, "```" + return True, opening, fence_line(opening) + + chunks: list[str] = [] + while content: + if len(content) <= max_len: + chunks.append(content) + break + + cut = content[:max_len] + pos = cut.rfind("\n") + if pos <= 0: + pos = cut.rfind(" ") + if pos <= 0: + pos = max_len + + inside_code, opening, fence = split_inside_fenced_code_block(pos) + if inside_code: + if opening > 0: + pos = opening + else: + closing = "\n```" + min_code_pos = len(fence) + if content.startswith(fence + "\n"): + min_code_pos += 1 + if pos < min_code_pos and min_code_pos + len(closing) > max_len: + chunks.append(content[:max_len]) + content = content[max_len:].lstrip() + continue + if pos + len(closing) > max_len: + budget = max_len - len(closing) + if budget > 0: + recut = content[:budget] + adjusted = recut.rfind("\n") + if adjusted <= 0: + adjusted = recut.rfind(" ") + pos = adjusted if adjusted > 0 else budget + else: + closing = "```" + pos = max_len - len(closing) + chunks.append(content[:pos] + closing) + remainder = content[pos:] + if remainder.startswith("\n"): + remainder = remainder[1:] + content = f"{fence}\n{remainder}" + continue + + chunks.append(content[:pos]) + content = content[pos:].lstrip() + return chunks + + def _escape_telegram_html(text: str) -> str: """Escape text for Telegram HTML parse mode.""" return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;") @@ -632,7 +705,7 @@ class TelegramChannel(BaseChannel): # Fallback: no native keyboard → splice labels into the message so the choices survive. 
if buttons and reply_markup is None: text = f"{text}\n\n{self._buttons_as_text(buttons)}" - chunks = split_message(text, TELEGRAM_MAX_MESSAGE_LEN) + chunks = _split_telegram_markdown(text, TELEGRAM_MAX_MESSAGE_LEN) for i, chunk in enumerate(chunks): is_last = (i == len(chunks) - 1) await self._send_text( @@ -838,7 +911,7 @@ class TelegramChannel(BaseChannel): intermediate chunks as standalone messages, then opens a new message for the tail so subsequent deltas continue streaming into it. """ - chunks = split_message(buf.text, TELEGRAM_MAX_MESSAGE_LEN) + chunks = _split_telegram_markdown(buf.text, TELEGRAM_MAX_MESSAGE_LEN) if len(chunks) <= 1: return try: diff --git a/nanobot/utils/helpers.py b/nanobot/utils/helpers.py index 181cea9ca..6341bc2bc 100644 --- a/nanobot/utils/helpers.py +++ b/nanobot/utils/helpers.py @@ -368,22 +368,6 @@ def maybe_persist_tool_result( ) -def _fence_line(content: str, fence_pos: int) -> str: - line_end = content.find("\n", fence_pos) - if line_end < 0: - return content[fence_pos:] - return content[fence_pos:line_end] - - -def _split_inside_fenced_code_block(content: str, pos: int) -> tuple[bool, int, str]: - if content[:pos].count("```") % 2 == 0: - return False, -1, "" - opening = content.rfind("```", 0, pos) - if opening < 0: - return True, -1, "```" - return True, opening, _fence_line(content, opening) - - def split_message(content: str, max_len: int = 2000) -> list[str]: """ Split content into chunks within max_len, preferring line breaks. 
@@ -411,36 +395,6 @@ def split_message(content: str, max_len: int = 2000) -> list[str]: pos = cut.rfind(" ") if pos <= 0: pos = max_len - inside_code, opening, fence = _split_inside_fenced_code_block(content, pos) - if inside_code: - if opening > 0: - pos = opening - else: - closing = "\n```" - min_code_pos = len(fence) - if content.startswith(fence + "\n"): - min_code_pos += 1 - if pos < min_code_pos and min_code_pos + len(closing) > max_len: - chunks.append(content[:max_len]) - content = content[max_len:].lstrip() - continue - if pos + len(closing) > max_len: - budget = max_len - len(closing) - if budget > 0: - recut = content[:budget] - adjusted = recut.rfind("\n") - if adjusted <= 0: - adjusted = recut.rfind(" ") - pos = adjusted if adjusted > 0 else budget - else: - closing = "```" - pos = max_len - len(closing) - chunks.append(content[:pos] + closing) - remainder = content[pos:] - if remainder.startswith("\n"): - remainder = remainder[1:] - content = f"{fence}\n{remainder}" - continue chunks.append(content[:pos]) content = content[pos:].lstrip() return chunks diff --git a/tests/channels/test_telegram_channel.py b/tests/channels/test_telegram_channel.py index 9b66d58be..5115791d9 100644 --- a/tests/channels/test_telegram_channel.py +++ b/tests/channels/test_telegram_channel.py @@ -17,6 +17,8 @@ from nanobot.channels.telegram import ( TELEGRAM_REPLY_CONTEXT_MAX_LEN, TelegramChannel, TelegramConfig, + _markdown_to_telegram_html, + _split_telegram_markdown, _StreamBuf, ) @@ -179,6 +181,67 @@ def _make_telegram_update( return SimpleNamespace(message=message, effective_user=user) +def _assert_code_blocks_render_balanced(chunks: list[str]) -> None: + for chunk in chunks: + html = _markdown_to_telegram_html(chunk) + assert html.count("
") == html.count("")
+
+
+def test_split_telegram_markdown_inside_code_block_moves_before_fence() -> None:
+ content = "Intro paragraph.\n```python\nprint('a')\nprint('b')\n```\nDone"
+ # The 35-character limit falls inside the ```python block that follows the intro line.
+ chunks = _split_telegram_markdown(content, max_len=35)
+ # Expect the intro emitted on its own and the next chunk to begin with the intact opening fence.
+ assert chunks[0] == "Intro paragraph.\n"
+ assert chunks[1].startswith("```python\nprint('a')")
+ _assert_code_blocks_render_balanced(chunks)
+
+
+def test_split_telegram_markdown_long_code_block_closes_and_reopens() -> None:
+ content = "```python\n" + ("print('line one')\n" * 6) + "```\nDone"
+ # The input opens with a fence and its six code lines alone exceed max_len=60.
+ chunks = _split_telegram_markdown(content, max_len=60)
+ # Every chunk must fit the limit; the first ends with a closing fence and the second reopens with ```python.
+ assert len(chunks) > 1
+ assert all(len(chunk) <= 60 for chunk in chunks)
+ assert chunks[0].startswith("```python\n")
+ assert chunks[0].endswith("\n```")
+ assert chunks[1].startswith("```python\n")
+ _assert_code_blocks_render_balanced(chunks)
+
+
+def test_split_telegram_markdown_multiple_code_blocks() -> None:
+ content = (
+ "First\n"
+ "```js\n"
+ "one();\n"
+ "```\n"
+ "Middle paragraph here\n"
+ "```py\n"
+ "two()\n"
+ "three()\n"
+ "```\n"
+ "End"
+ )
+ # Two fenced blocks; the 55-character limit is reached inside the second (```py) one.
+ chunks = _split_telegram_markdown(content, max_len=55)
+ # The first chunk must end right after the middle paragraph so the ```py block starts the next chunk.
+ assert chunks[0].endswith("Middle paragraph here\n")
+ assert chunks[1].startswith("```py\n")
+ _assert_code_blocks_render_balanced(chunks)
+
+
+def test_split_telegram_markdown_leading_whitespace_before_fence() -> None:
+ content = "\n```python\n" + ("print('line one')\n" * 6) + "```\nDone"
+ # The input starts with a bare newline ahead of the opening fence.
+ chunks = _split_telegram_markdown(content, max_len=60)
+ # No chunk may be empty or whitespace-only, and the first chunk must start at the fence itself.
+ assert chunks
+ assert all(chunk.strip() for chunk in chunks)
+ assert chunks[0].startswith("```python\n")
+ _assert_code_blocks_render_balanced(chunks)
+
+
@pytest.mark.asyncio
async def test_start_creates_separate_pools_with_proxy(monkeypatch) -> None:
_FakeHTTPXRequest.clear()
diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py
index 1823c9b34..9dd133d84 100644
--- a/tests/utils/test_helpers.py
+++ b/tests/utils/test_helpers.py
@@ -5,56 +5,3 @@ def test_split_message_no_code_blocks_unchanged():
content = "alpha beta gamma delta"
assert split_message(content, max_len=12) == ["alpha beta", "gamma delta"]
-
-
-def test_split_message_outside_code_block_unchanged():
- content = "alpha beta gamma delta\n```python\nx = 1\n```\ndone"
-
- chunks = split_message(content, max_len=12)
-
- assert chunks[0] == "alpha beta"
- assert chunks[1].startswith("gamma")
-
-
-def test_split_message_inside_code_block_moves_before_fence():
- content = "Intro paragraph.\n```python\nprint('a')\nprint('b')\n```\nDone"
-
- chunks = split_message(content, max_len=35)
-
- assert chunks[0] == "Intro paragraph.\n"
- assert chunks[1].startswith("```python\nprint('a')")
- assert all(chunk.count("```") % 2 == 0 for chunk in chunks[1:])
-
-
-def test_split_message_code_block_longer_than_max_len_closes_and_reopens():
- content = "```python\n" + ("print('line one')\n" * 6) + "```\nDone"
-
- chunks = split_message(content, max_len=60)
-
- assert len(chunks) > 1
- assert all(len(chunk) <= 60 for chunk in chunks)
- assert all(chunk.count("```") % 2 == 0 for chunk in chunks)
- assert chunks[0].startswith("```python\n")
- assert chunks[0].endswith("\n```")
- assert chunks[1].startswith("```python\n")
-
-
-def test_split_message_multiple_code_blocks_moves_second_block_to_next_chunk():
- content = (
- "First\n"
- "```js\n"
- "one();\n"
- "```\n"
- "Middle paragraph here\n"
- "```py\n"
- "two()\n"
- "three()\n"
- "```\n"
- "End"
- )
-
- chunks = split_message(content, max_len=55)
-
- assert chunks[0].endswith("Middle paragraph here\n")
- assert chunks[1].startswith("```py\n")
- assert all(chunk.count("```") % 2 == 0 for chunk in chunks)