mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-06-15 15:24:06 +00:00
fix(telegram): move fenced-code-block splitting into Telegram-specific helper
Move the fenced-code-block-aware splitting logic out of the shared split_message helper (used by Signal, Slack, Discord, Weixin, etc.) and into a Telegram-specific _split_telegram_markdown function. The shared split_message remains a plain-text chunker. The Telegram channel now uses _split_telegram_markdown for its raw Markdown paths that feed _markdown_to_telegram_html, preventing broken HTML rendering when splits fall inside fenced code blocks. Also fixes a regression where content beginning with whitespace before a fence could emit a whitespace-only chunk. Addresses review feedback on #4257.
This commit is contained in:
parent
131446fa61
commit
a5a816abaf
@ -43,6 +43,79 @@ TELEGRAM_HTML_MAX_LEN = 4096
|
|||||||
TELEGRAM_REPLY_CONTEXT_MAX_LEN = TELEGRAM_MAX_MESSAGE_LEN # Max length for reply context in user message
|
TELEGRAM_REPLY_CONTEXT_MAX_LEN = TELEGRAM_MAX_MESSAGE_LEN # Max length for reply context in user message
|
||||||
|
|
||||||
|
|
||||||
|
def _split_telegram_markdown(content: str, max_len: int) -> list[str]:
|
||||||
|
"""Split raw Telegram Markdown without leaving fenced code blocks unbalanced."""
|
||||||
|
if not content:
|
||||||
|
return []
|
||||||
|
content = content.lstrip()
|
||||||
|
if not content:
|
||||||
|
return []
|
||||||
|
if len(content) <= max_len:
|
||||||
|
return [content]
|
||||||
|
|
||||||
|
def fence_line(fence_pos: int) -> str:
|
||||||
|
line_end = content.find("\n", fence_pos)
|
||||||
|
if line_end < 0:
|
||||||
|
return content[fence_pos:]
|
||||||
|
return content[fence_pos:line_end]
|
||||||
|
|
||||||
|
def split_inside_fenced_code_block(pos: int) -> tuple[bool, int, str]:
|
||||||
|
if content[:pos].count("```") % 2 == 0:
|
||||||
|
return False, -1, ""
|
||||||
|
opening = content.rfind("```", 0, pos)
|
||||||
|
if opening < 0:
|
||||||
|
return True, -1, "```"
|
||||||
|
return True, opening, fence_line(opening)
|
||||||
|
|
||||||
|
chunks: list[str] = []
|
||||||
|
while content:
|
||||||
|
if len(content) <= max_len:
|
||||||
|
chunks.append(content)
|
||||||
|
break
|
||||||
|
|
||||||
|
cut = content[:max_len]
|
||||||
|
pos = cut.rfind("\n")
|
||||||
|
if pos <= 0:
|
||||||
|
pos = cut.rfind(" ")
|
||||||
|
if pos <= 0:
|
||||||
|
pos = max_len
|
||||||
|
|
||||||
|
inside_code, opening, fence = split_inside_fenced_code_block(pos)
|
||||||
|
if inside_code:
|
||||||
|
if opening > 0:
|
||||||
|
pos = opening
|
||||||
|
else:
|
||||||
|
closing = "\n```"
|
||||||
|
min_code_pos = len(fence)
|
||||||
|
if content.startswith(fence + "\n"):
|
||||||
|
min_code_pos += 1
|
||||||
|
if pos < min_code_pos and min_code_pos + len(closing) > max_len:
|
||||||
|
chunks.append(content[:max_len])
|
||||||
|
content = content[max_len:].lstrip()
|
||||||
|
continue
|
||||||
|
if pos + len(closing) > max_len:
|
||||||
|
budget = max_len - len(closing)
|
||||||
|
if budget > 0:
|
||||||
|
recut = content[:budget]
|
||||||
|
adjusted = recut.rfind("\n")
|
||||||
|
if adjusted <= 0:
|
||||||
|
adjusted = recut.rfind(" ")
|
||||||
|
pos = adjusted if adjusted > 0 else budget
|
||||||
|
else:
|
||||||
|
closing = "```"
|
||||||
|
pos = max_len - len(closing)
|
||||||
|
chunks.append(content[:pos] + closing)
|
||||||
|
remainder = content[pos:]
|
||||||
|
if remainder.startswith("\n"):
|
||||||
|
remainder = remainder[1:]
|
||||||
|
content = f"{fence}\n{remainder}"
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunks.append(content[:pos])
|
||||||
|
content = content[pos:].lstrip()
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
def _escape_telegram_html(text: str) -> str:
|
def _escape_telegram_html(text: str) -> str:
|
||||||
"""Escape text for Telegram HTML parse mode."""
|
"""Escape text for Telegram HTML parse mode."""
|
||||||
return text.replace("&", "&").replace("<", "<").replace(">", ">")
|
return text.replace("&", "&").replace("<", "<").replace(">", ">")
|
||||||
@ -632,7 +705,7 @@ class TelegramChannel(BaseChannel):
|
|||||||
# Fallback: no native keyboard → splice labels into the message so the choices survive.
|
# Fallback: no native keyboard → splice labels into the message so the choices survive.
|
||||||
if buttons and reply_markup is None:
|
if buttons and reply_markup is None:
|
||||||
text = f"{text}\n\n{self._buttons_as_text(buttons)}"
|
text = f"{text}\n\n{self._buttons_as_text(buttons)}"
|
||||||
chunks = split_message(text, TELEGRAM_MAX_MESSAGE_LEN)
|
chunks = _split_telegram_markdown(text, TELEGRAM_MAX_MESSAGE_LEN)
|
||||||
for i, chunk in enumerate(chunks):
|
for i, chunk in enumerate(chunks):
|
||||||
is_last = (i == len(chunks) - 1)
|
is_last = (i == len(chunks) - 1)
|
||||||
await self._send_text(
|
await self._send_text(
|
||||||
@ -838,7 +911,7 @@ class TelegramChannel(BaseChannel):
|
|||||||
intermediate chunks as standalone messages, then opens a new message
|
intermediate chunks as standalone messages, then opens a new message
|
||||||
for the tail so subsequent deltas continue streaming into it.
|
for the tail so subsequent deltas continue streaming into it.
|
||||||
"""
|
"""
|
||||||
chunks = split_message(buf.text, TELEGRAM_MAX_MESSAGE_LEN)
|
chunks = _split_telegram_markdown(buf.text, TELEGRAM_MAX_MESSAGE_LEN)
|
||||||
if len(chunks) <= 1:
|
if len(chunks) <= 1:
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -368,22 +368,6 @@ def maybe_persist_tool_result(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _fence_line(content: str, fence_pos: int) -> str:
|
|
||||||
line_end = content.find("\n", fence_pos)
|
|
||||||
if line_end < 0:
|
|
||||||
return content[fence_pos:]
|
|
||||||
return content[fence_pos:line_end]
|
|
||||||
|
|
||||||
|
|
||||||
def _split_inside_fenced_code_block(content: str, pos: int) -> tuple[bool, int, str]:
|
|
||||||
if content[:pos].count("```") % 2 == 0:
|
|
||||||
return False, -1, ""
|
|
||||||
opening = content.rfind("```", 0, pos)
|
|
||||||
if opening < 0:
|
|
||||||
return True, -1, "```"
|
|
||||||
return True, opening, _fence_line(content, opening)
|
|
||||||
|
|
||||||
|
|
||||||
def split_message(content: str, max_len: int = 2000) -> list[str]:
|
def split_message(content: str, max_len: int = 2000) -> list[str]:
|
||||||
"""
|
"""
|
||||||
Split content into chunks within max_len, preferring line breaks.
|
Split content into chunks within max_len, preferring line breaks.
|
||||||
@ -411,36 +395,6 @@ def split_message(content: str, max_len: int = 2000) -> list[str]:
|
|||||||
pos = cut.rfind(" ")
|
pos = cut.rfind(" ")
|
||||||
if pos <= 0:
|
if pos <= 0:
|
||||||
pos = max_len
|
pos = max_len
|
||||||
inside_code, opening, fence = _split_inside_fenced_code_block(content, pos)
|
|
||||||
if inside_code:
|
|
||||||
if opening > 0:
|
|
||||||
pos = opening
|
|
||||||
else:
|
|
||||||
closing = "\n```"
|
|
||||||
min_code_pos = len(fence)
|
|
||||||
if content.startswith(fence + "\n"):
|
|
||||||
min_code_pos += 1
|
|
||||||
if pos < min_code_pos and min_code_pos + len(closing) > max_len:
|
|
||||||
chunks.append(content[:max_len])
|
|
||||||
content = content[max_len:].lstrip()
|
|
||||||
continue
|
|
||||||
if pos + len(closing) > max_len:
|
|
||||||
budget = max_len - len(closing)
|
|
||||||
if budget > 0:
|
|
||||||
recut = content[:budget]
|
|
||||||
adjusted = recut.rfind("\n")
|
|
||||||
if adjusted <= 0:
|
|
||||||
adjusted = recut.rfind(" ")
|
|
||||||
pos = adjusted if adjusted > 0 else budget
|
|
||||||
else:
|
|
||||||
closing = "```"
|
|
||||||
pos = max_len - len(closing)
|
|
||||||
chunks.append(content[:pos] + closing)
|
|
||||||
remainder = content[pos:]
|
|
||||||
if remainder.startswith("\n"):
|
|
||||||
remainder = remainder[1:]
|
|
||||||
content = f"{fence}\n{remainder}"
|
|
||||||
continue
|
|
||||||
chunks.append(content[:pos])
|
chunks.append(content[:pos])
|
||||||
content = content[pos:].lstrip()
|
content = content[pos:].lstrip()
|
||||||
return chunks
|
return chunks
|
||||||
|
|||||||
@ -17,6 +17,8 @@ from nanobot.channels.telegram import (
|
|||||||
TELEGRAM_REPLY_CONTEXT_MAX_LEN,
|
TELEGRAM_REPLY_CONTEXT_MAX_LEN,
|
||||||
TelegramChannel,
|
TelegramChannel,
|
||||||
TelegramConfig,
|
TelegramConfig,
|
||||||
|
_markdown_to_telegram_html,
|
||||||
|
_split_telegram_markdown,
|
||||||
_StreamBuf,
|
_StreamBuf,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -179,6 +181,67 @@ def _make_telegram_update(
|
|||||||
return SimpleNamespace(message=message, effective_user=user)
|
return SimpleNamespace(message=message, effective_user=user)
|
||||||
|
|
||||||
|
|
||||||
|
def _assert_code_blocks_render_balanced(chunks: list[str]) -> None:
    """Assert each chunk converts to HTML with matching code-block open/close tags."""
    for piece in chunks:
        rendered = _markdown_to_telegram_html(piece)
        opened = rendered.count("<pre><code>")
        closed = rendered.count("</code></pre>")
        assert opened == closed
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_telegram_markdown_inside_code_block_moves_before_fence() -> None:
    """A split that would land inside a fence is moved to just before the opener."""
    intro = "Intro paragraph.\n"
    fenced = "```python\nprint('a')\nprint('b')\n```\n"

    chunks = _split_telegram_markdown(intro + fenced + "Done", max_len=35)

    assert chunks[0] == intro
    assert chunks[1].startswith("```python\nprint('a')")
    _assert_code_blocks_render_balanced(chunks)
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_telegram_markdown_long_code_block_closes_and_reopens() -> None:
    """A block longer than the limit is closed per chunk and re-opened in the next."""
    limit = 60
    opener = "```python\n"
    body = "print('line one')\n" * 6
    content = f"{opener}{body}```\nDone"

    chunks = _split_telegram_markdown(content, max_len=limit)

    assert len(chunks) > 1
    assert all(len(chunk) <= limit for chunk in chunks)
    assert chunks[0].startswith(opener)
    assert chunks[0].endswith("\n```")
    assert chunks[1].startswith(opener)
    _assert_code_blocks_render_balanced(chunks)
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_telegram_markdown_multiple_code_blocks() -> None:
    """With two blocks, the second one is pushed whole into the next chunk."""
    lines = [
        "First",
        "```js",
        "one();",
        "```",
        "Middle paragraph here",
        "```py",
        "two()",
        "three()",
        "```",
        "End",
    ]
    content = "\n".join(lines)

    chunks = _split_telegram_markdown(content, max_len=55)

    assert chunks[0].endswith("Middle paragraph here\n")
    assert chunks[1].startswith("```py\n")
    _assert_code_blocks_render_balanced(chunks)
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_telegram_markdown_leading_whitespace_before_fence() -> None:
    """Whitespace ahead of the first fence must not produce a blank chunk."""
    body = "print('line one')\n" * 6
    content = f"\n```python\n{body}```\nDone"

    chunks = _split_telegram_markdown(content, max_len=60)

    assert chunks
    assert all(chunk.strip() for chunk in chunks)
    assert chunks[0].startswith("```python\n")
    _assert_code_blocks_render_balanced(chunks)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_start_creates_separate_pools_with_proxy(monkeypatch) -> None:
|
async def test_start_creates_separate_pools_with_proxy(monkeypatch) -> None:
|
||||||
_FakeHTTPXRequest.clear()
|
_FakeHTTPXRequest.clear()
|
||||||
|
|||||||
@ -5,56 +5,3 @@ def test_split_message_no_code_blocks_unchanged():
|
|||||||
content = "alpha beta gamma delta"
|
content = "alpha beta gamma delta"
|
||||||
|
|
||||||
assert split_message(content, max_len=12) == ["alpha beta", "gamma delta"]
|
assert split_message(content, max_len=12) == ["alpha beta", "gamma delta"]
|
||||||
|
|
||||||
|
|
||||||
def test_split_message_outside_code_block_unchanged():
    """Splits that fall before any fence behave like plain-text splitting."""
    prose = "alpha beta gamma delta"
    fenced = "```python\nx = 1\n```"

    chunks = split_message(f"{prose}\n{fenced}\ndone", max_len=12)

    assert chunks[0] == "alpha beta"
    assert chunks[1].startswith("gamma")
|
|
||||||
|
|
||||||
|
|
||||||
def test_split_message_inside_code_block_moves_before_fence():
    """A split landing inside a fence is moved to just before the opening fence."""
    intro = "Intro paragraph.\n"
    fenced = "```python\nprint('a')\nprint('b')\n```\n"

    chunks = split_message(intro + fenced + "Done", max_len=35)

    assert chunks[0] == intro
    assert chunks[1].startswith("```python\nprint('a')")
    # Every chunk after the intro must contain balanced fence markers.
    assert all(chunk.count("```") % 2 == 0 for chunk in chunks[1:])
|
|
||||||
|
|
||||||
|
|
||||||
def test_split_message_code_block_longer_than_max_len_closes_and_reopens():
    """A block longer than the limit is closed per chunk and re-opened in the next."""
    limit = 60
    opener = "```python\n"
    body = "print('line one')\n" * 6
    content = f"{opener}{body}```\nDone"

    chunks = split_message(content, max_len=limit)

    assert len(chunks) > 1
    assert all(len(chunk) <= limit for chunk in chunks)
    assert all(chunk.count("```") % 2 == 0 for chunk in chunks)
    assert chunks[0].startswith(opener)
    assert chunks[0].endswith("\n```")
    assert chunks[1].startswith(opener)
|
|
||||||
|
|
||||||
|
|
||||||
def test_split_message_multiple_code_blocks_moves_second_block_to_next_chunk():
    """With two blocks, the second one is pushed whole into the next chunk."""
    lines = [
        "First",
        "```js",
        "one();",
        "```",
        "Middle paragraph here",
        "```py",
        "two()",
        "three()",
        "```",
        "End",
    ]
    content = "\n".join(lines)

    chunks = split_message(content, max_len=55)

    assert chunks[0].endswith("Middle paragraph here\n")
    assert chunks[1].startswith("```py\n")
    assert all(chunk.count("```") % 2 == 0 for chunk in chunks)
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user