mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-08 02:35:59 +00:00
fix(telegram): improve markdown rendering for modern LLM output
Problem:
Modern LLMs (GPT-5.4, Claude, Gemini) produce markdown-heavy responses with numbered lists, headers, and nested formatting. The Telegram channel's _markdown_to_telegram_html() converter has gaps that leave these poorly formatted:
1. Numbered lists (1. 2. 3.) have zero handling — sent as raw text
2. Headers (# Title) are stripped to plain text, losing visual hierarchy
3. Mid-stream edits send raw markdown (users see **bold** and ### headers while the response generates, before the final HTML conversion)

Root Cause:
_markdown_to_telegram_html() handles bullets (- *) but skips numbered lists entirely. Headers are stripped of # but not given any emphasis. The streaming path in send_delta() sends buf.text as-is during mid-stream edits (plain text, no parse_mode) — only the final _stream_end edit converts to HTML.

Fix:
1. Headers now render as <b>bold</b> in the final HTML (using placeholder markers that survive HTML escaping, restored after all other processing)
2. Numbered lists are normalized (extra whitespace after the dot is cleaned)
3. New _strip_md_block() function strips markdown syntax for readable plain-text preview during streaming mid-edits

The final _stream_end HTML conversion is unchanged — it still produces full HTML with parse_mode=HTML. Only the intermediate edits are improved.

Tests:
Added 10 new tests covering:
- Headers converting to bold HTML
- Numbered list preservation and whitespace normalization
- Headers with HTML special characters
- Mixed formatting (headers + bullets + numbers + bold)
- _strip_md_block for inline formatting, headers, bullets, numbers, links
- Streaming mid-edit markdown stripping (initial send + edit)
This commit is contained in:
parent
37ea8b8f5b
commit
f8a023218d
@ -53,6 +53,34 @@ def _strip_md(s: str) -> str:
|
|||||||
return s.strip()
|
return s.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_md_block(text: str) -> str:
|
||||||
|
"""Strip block-level and inline markdown for readable plain-text preview.
|
||||||
|
|
||||||
|
Used during streaming mid-edits so users see clean text instead of raw
|
||||||
|
markdown syntax while the response is still being generated.
|
||||||
|
"""
|
||||||
|
# Code blocks -> just the code
|
||||||
|
text = re.sub(r'```[\w]*\n?([\s\S]*?)```', r'\1', text)
|
||||||
|
# Headers -> plain text
|
||||||
|
text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE)
|
||||||
|
# Blockquotes
|
||||||
|
text = re.sub(r'^>\s*(.*)$', r'\1', text, flags=re.MULTILINE)
|
||||||
|
# Bold / italic / strikethrough
|
||||||
|
text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
|
||||||
|
text = re.sub(r'__(.+?)__', r'\1', text)
|
||||||
|
text = re.sub(r'(?<![a-zA-Z0-9])_([^_]+)_(?![a-zA-Z0-9])', r'\1', text)
|
||||||
|
text = re.sub(r'~~(.+?)~~', r'\1', text)
|
||||||
|
# Inline code
|
||||||
|
text = re.sub(r'`([^`]+)`', r'\1', text)
|
||||||
|
# Links [text](url) -> text
|
||||||
|
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
|
||||||
|
# Bullet lists
|
||||||
|
text = re.sub(r'^[-*]\s+', '• ', text, flags=re.MULTILINE)
|
||||||
|
# Numbered lists (normalize spacing)
|
||||||
|
text = re.sub(r'^(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
def _render_table_box(table_lines: list[str]) -> str:
|
def _render_table_box(table_lines: list[str]) -> str:
|
||||||
"""Convert markdown pipe-table to compact aligned text for <pre> display."""
|
"""Convert markdown pipe-table to compact aligned text for <pre> display."""
|
||||||
|
|
||||||
@ -129,8 +157,8 @@ def _markdown_to_telegram_html(text: str) -> str:
|
|||||||
|
|
||||||
text = re.sub(r'`([^`]+)`', save_inline_code, text)
|
text = re.sub(r'`([^`]+)`', save_inline_code, text)
|
||||||
|
|
||||||
# 3. Headers # Title -> just the title text
|
# 3. Headers # Title -> <b>Title</b> (preserve visual hierarchy)
|
||||||
text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE)
|
text = re.sub(r'^#{1,6}\s+(.+)$', r'⟪B⟫\1⟪/B⟫', text, flags=re.MULTILINE)
|
||||||
|
|
||||||
# 4. Blockquotes > text -> just the text (before HTML escaping)
|
# 4. Blockquotes > text -> just the text (before HTML escaping)
|
||||||
text = re.sub(r'^>\s*(.*)$', r'\1', text, flags=re.MULTILINE)
|
text = re.sub(r'^>\s*(.*)$', r'\1', text, flags=re.MULTILINE)
|
||||||
@ -154,6 +182,9 @@ def _markdown_to_telegram_html(text: str) -> str:
|
|||||||
# 10. Bullet lists - item -> • item
|
# 10. Bullet lists - item -> • item
|
||||||
text = re.sub(r'^[-*]\s+', '• ', text, flags=re.MULTILINE)
|
text = re.sub(r'^[-*]\s+', '• ', text, flags=re.MULTILINE)
|
||||||
|
|
||||||
|
# 10.5. Numbered lists 1. item -> 1. item (keep number, normalize indent)
|
||||||
|
text = re.sub(r'^(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
|
||||||
|
|
||||||
# 11. Restore inline code with HTML tags
|
# 11. Restore inline code with HTML tags
|
||||||
for i, code in enumerate(inline_codes):
|
for i, code in enumerate(inline_codes):
|
||||||
# Escape HTML in code content
|
# Escape HTML in code content
|
||||||
@ -166,6 +197,9 @@ def _markdown_to_telegram_html(text: str) -> str:
|
|||||||
escaped = _escape_telegram_html(code)
|
escaped = _escape_telegram_html(code)
|
||||||
text = text.replace(f"\x00CB{i}\x00", f"<pre><code>{escaped}</code></pre>")
|
text = text.replace(f"\x00CB{i}\x00", f"<pre><code>{escaped}</code></pre>")
|
||||||
|
|
||||||
|
# 13. Restore header bold markers (inserted in step 3, after HTML escaping)
|
||||||
|
text = text.replace('⟪B⟫', '<b>').replace('⟪/B⟫', '</b>')
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
@ -637,10 +671,11 @@ class TelegramChannel(BaseChannel):
|
|||||||
if message_thread_id := meta.get("message_thread_id"):
|
if message_thread_id := meta.get("message_thread_id"):
|
||||||
thread_kwargs["message_thread_id"] = message_thread_id
|
thread_kwargs["message_thread_id"] = message_thread_id
|
||||||
if buf.message_id is None:
|
if buf.message_id is None:
|
||||||
|
preview = _strip_md_block(buf.text)
|
||||||
try:
|
try:
|
||||||
sent = await self._call_with_retry(
|
sent = await self._call_with_retry(
|
||||||
self._app.bot.send_message,
|
self._app.bot.send_message,
|
||||||
chat_id=int_chat_id, text=buf.text,
|
chat_id=int_chat_id, text=preview,
|
||||||
**thread_kwargs,
|
**thread_kwargs,
|
||||||
)
|
)
|
||||||
buf.message_id = sent.message_id
|
buf.message_id = sent.message_id
|
||||||
@ -653,11 +688,12 @@ class TelegramChannel(BaseChannel):
|
|||||||
await self._flush_stream_overflow(int_chat_id, buf, thread_kwargs)
|
await self._flush_stream_overflow(int_chat_id, buf, thread_kwargs)
|
||||||
buf.last_edit = now
|
buf.last_edit = now
|
||||||
return
|
return
|
||||||
|
preview = _strip_md_block(buf.text)
|
||||||
try:
|
try:
|
||||||
await self._call_with_retry(
|
await self._call_with_retry(
|
||||||
self._app.bot.edit_message_text,
|
self._app.bot.edit_message_text,
|
||||||
chat_id=int_chat_id, message_id=buf.message_id,
|
chat_id=int_chat_id, message_id=buf.message_id,
|
||||||
text=buf.text,
|
text=preview,
|
||||||
)
|
)
|
||||||
buf.last_edit = now
|
buf.last_edit = now
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@ -1471,3 +1471,123 @@ async def test_send_text_bad_request_plain_fallback_exhausted() -> None:
|
|||||||
# so HTML fails after 1 attempt → fallback to plain also fails after 1 attempt.
|
# so HTML fails after 1 attempt → fallback to plain also fails after 1 attempt.
|
||||||
# Before the fix: 2 total. After the fix: still 2 (BadRequest SHOULD fallback).
|
# Before the fix: 2 total. After the fix: still 2 (BadRequest SHOULD fallback).
|
||||||
assert call_count == 2, f"Expected 2 calls (1 HTML + 1 plain), got {call_count}"
|
assert call_count == 2, f"Expected 2 calls (1 HTML + 1 plain), got {call_count}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _markdown_to_telegram_html formatting tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_markdown_to_html_headers_become_bold() -> None:
    """A markdown header of depth 1, 2 or 3 converts to a single bold element."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    expected_html_by_markdown = {
        "# Title": "<b>Title</b>",
        "## Subtitle": "<b>Subtitle</b>",
        "### Deep": "<b>Deep</b>",
    }
    for markdown, expected_html in expected_html_by_markdown.items():
        assert _markdown_to_telegram_html(markdown) == expected_html
|
||||||
|
|
||||||
|
|
||||||
|
def test_markdown_to_html_numbered_lists_preserved() -> None:
    """Every item of a numbered list keeps its number and text in the output."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    items = ["1. First", "2. Second", "3. Third"]
    rendered = _markdown_to_telegram_html("\n".join(items))
    for item in items:
        assert item in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def test_markdown_to_html_numbered_list_normalizes_whitespace() -> None:
    """Runs of spaces after the list number's dot collapse to one space."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    # The input must really contain extra spaces after the dot; with a single
    # space the assertions below would pass even without any normalization.
    text = "1.   Lots of space\n2.  Two spaces"
    result = _markdown_to_telegram_html(text)
    assert "1. Lots of space" in result
    assert "2. Two spaces" in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_markdown_to_html_headers_survive_html_escaping() -> None:
    """Headers containing special HTML chars should still render as bold."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    result = _markdown_to_telegram_html("# A < B & C > D")
    # The header text is HTML-escaped by the converter, while the <b> wrapper
    # is restored from placeholders afterwards, so only the inner characters
    # appear as entities.
    assert result == "<b>A &lt; B &amp; C &gt; D</b>"
|
||||||
|
|
||||||
|
|
||||||
|
def test_markdown_to_html_mixed_formatting() -> None:
    """A document mixing a header, bullets, numbered steps and bold converts cleanly."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    rendered = _markdown_to_telegram_html(
        "# Overview\n\n- bullet one\n- bullet two\n\n1. step one\n2. step two\n\n**bold text**"
    )
    expected_fragments = (
        "<b>Overview</b>",
        "\u2022 bullet one",
        "1. step one",
        "<b>bold text</b>",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _strip_md_block tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_strip_md_block_removes_inline_formatting() -> None:
    """Bold, italic and strikethrough markers are dropped, leaving the words."""
    from nanobot.channels.telegram import _strip_md_block

    stripped = _strip_md_block("**bold** and _italic_ and ~~struck~~")
    assert stripped == "bold and italic and struck"
|
||||||
|
|
||||||
|
|
||||||
|
def test_strip_md_block_strips_headers() -> None:
    """Leading ``#`` markers are removed and the following line is untouched."""
    from nanobot.channels.telegram import _strip_md_block

    stripped = _strip_md_block("## Title\nBody")
    assert stripped == "Title\nBody"
|
||||||
|
|
||||||
|
|
||||||
|
def test_strip_md_block_converts_bullets_and_numbers() -> None:
    """Dash bullets turn into ``•`` while numbered items keep their numbers."""
    from nanobot.channels.telegram import _strip_md_block

    stripped = _strip_md_block("- item a\n1. item b\n2. item c")
    for fragment in ("\u2022 item a", "1. item b", "2. item c"):
        assert fragment in stripped
|
||||||
|
|
||||||
|
|
||||||
|
def test_strip_md_block_strips_links() -> None:
    """A markdown link is reduced to its visible label."""
    from nanobot.channels.telegram import _strip_md_block

    stripped = _strip_md_block("[click here](https://example.com)")
    assert stripped == "click here"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Streaming mid-edit uses _strip_md_block
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_send_delta_mid_stream_strips_markdown() -> None:
    """Both the first streamed message and a later edit carry no raw markdown."""
    import time

    config = TelegramConfig(enabled=True, token="123:abc", allow_from=["*"])
    channel = TelegramChannel(config, MessageBus())
    channel._app = _FakeApp(lambda: None)
    channel._app.bot.send_message = AsyncMock(return_value=SimpleNamespace(message_id=42))
    channel._app.bot.edit_message_text = AsyncMock()

    # First delta: goes out through send_message.
    await channel.send_delta("999", "**hello** world")
    first_preview = channel._app.bot.send_message.call_args.kwargs.get("text", "")
    assert "**" not in first_preview
    assert "hello world" in first_preview

    # Second delta: move last_edit into the past to force the edit interval,
    # then check the text handed to edit_message_text.
    stream_buf = channel._stream_bufs["999"]
    stream_buf.last_edit = time.monotonic() - 10
    await channel.send_delta("999", "\n### Title\n1. step")
    second_preview = channel._app.bot.edit_message_text.call_args.kwargs.get("text", "")
    assert "###" not in second_preview
    assert "**" not in second_preview
    assert "Title" in second_preview
    assert "1. step" in second_preview
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user