fix(telegram): improve markdown rendering for modern LLM output

Problem:
Modern LLMs (GPT-5.4, Claude, Gemini) produce markdown-heavy responses with
numbered lists, headers, and nested formatting. The Telegram channel's
_markdown_to_telegram_html() converter has gaps that leave these poorly
formatted:

1. Numbered lists (1. 2. 3.) have zero handling — sent as raw text
2. Headers (# Title) are stripped to plain text, losing visual hierarchy
3. Mid-stream edits send raw markdown (users see **bold** and ### headers
   while the response generates, before the final HTML conversion)

Root Cause:
_markdown_to_telegram_html() handles bullets (- *) but skips numbered lists
entirely. Headers are stripped of # but not given any emphasis. The streaming
path in send_delta() sends buf.text as-is during mid-stream edits (plain
text, no parse_mode) — only the final _stream_end edit converts to HTML.

Fix:
1. Headers now render as <b>bold</b> in the final HTML (using placeholder
   markers that survive HTML escaping, restored after all other processing)
2. Numbered lists are normalized (extra whitespace after the dot is cleaned)
3. New _strip_md_block() function strips markdown syntax for readable
   plain-text preview during streaming mid-edits

The final _stream_end edit still runs the full HTML conversion with
parse_mode=HTML (now including the header and numbered-list changes above).
Within the streaming path itself, only the intermediate edits change.

Tests:
Added 10 new tests covering:
- Headers converting to bold HTML
- Numbered list preservation and whitespace normalization
- Headers with HTML special characters
- Mixed formatting (headers + bullets + numbers + bold)
- _strip_md_block for inline formatting, headers, bullets, numbers, links
- Streaming mid-edit markdown stripping (initial send + edit)
This commit is contained in:
hussein1362 2026-04-21 10:44:06 +03:00 committed by Xubin Ren
parent 37ea8b8f5b
commit f8a023218d
2 changed files with 160 additions and 4 deletions

View File

@ -53,6 +53,34 @@ def _strip_md(s: str) -> str:
return s.strip()
def _strip_md_block(text: str) -> str:
"""Strip block-level and inline markdown for readable plain-text preview.
Used during streaming mid-edits so users see clean text instead of raw
markdown syntax while the response is still being generated.
"""
# Code blocks -> just the code
text = re.sub(r'```[\w]*\n?([\s\S]*?)```', r'\1', text)
# Headers -> plain text
text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE)
# Blockquotes
text = re.sub(r'^>\s*(.*)$', r'\1', text, flags=re.MULTILINE)
# Bold / italic / strikethrough
text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
text = re.sub(r'__(.+?)__', r'\1', text)
text = re.sub(r'(?<![a-zA-Z0-9])_([^_]+)_(?![a-zA-Z0-9])', r'\1', text)
text = re.sub(r'~~(.+?)~~', r'\1', text)
# Inline code
text = re.sub(r'`([^`]+)`', r'\1', text)
# Links [text](url) -> text
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
# Bullet lists
text = re.sub(r'^[-*]\s+', '', text, flags=re.MULTILINE)
# Numbered lists (normalize spacing)
text = re.sub(r'^(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
return text
def _render_table_box(table_lines: list[str]) -> str:
"""Convert markdown pipe-table to compact aligned text for <pre> display."""
@ -129,8 +157,8 @@ def _markdown_to_telegram_html(text: str) -> str:
text = re.sub(r'`([^`]+)`', save_inline_code, text)
# 3. Headers # Title -> just the title text
text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE)
# 3. Headers # Title -> <b>Title</b> (preserve visual hierarchy)
text = re.sub(r'^#{1,6}\s+(.+)$', r'⟪B⟫\1⟪/B⟫', text, flags=re.MULTILINE)
# 4. Blockquotes > text -> just the text (before HTML escaping)
text = re.sub(r'^>\s*(.*)$', r'\1', text, flags=re.MULTILINE)
@ -154,6 +182,9 @@ def _markdown_to_telegram_html(text: str) -> str:
# 10. Bullet lists - item -> • item
text = re.sub(r'^[-*]\s+', '', text, flags=re.MULTILINE)
# 10.5. Numbered lists 1. item -> 1. item (keep number, normalize indent)
text = re.sub(r'^(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
# 11. Restore inline code with HTML tags
for i, code in enumerate(inline_codes):
# Escape HTML in code content
@ -166,6 +197,9 @@ def _markdown_to_telegram_html(text: str) -> str:
escaped = _escape_telegram_html(code)
text = text.replace(f"\x00CB{i}\x00", f"<pre><code>{escaped}</code></pre>")
# 13. Restore header bold markers (inserted in step 3, after HTML escaping)
text = text.replace('⟪B⟫', '<b>').replace('⟪/B⟫', '</b>')
return text
@ -637,10 +671,11 @@ class TelegramChannel(BaseChannel):
if message_thread_id := meta.get("message_thread_id"):
thread_kwargs["message_thread_id"] = message_thread_id
if buf.message_id is None:
preview = _strip_md_block(buf.text)
try:
sent = await self._call_with_retry(
self._app.bot.send_message,
chat_id=int_chat_id, text=buf.text,
chat_id=int_chat_id, text=preview,
**thread_kwargs,
)
buf.message_id = sent.message_id
@ -653,11 +688,12 @@ class TelegramChannel(BaseChannel):
await self._flush_stream_overflow(int_chat_id, buf, thread_kwargs)
buf.last_edit = now
return
preview = _strip_md_block(buf.text)
try:
await self._call_with_retry(
self._app.bot.edit_message_text,
chat_id=int_chat_id, message_id=buf.message_id,
text=buf.text,
text=preview,
)
buf.last_edit = now
except Exception as e:

View File

@ -1471,3 +1471,123 @@ async def test_send_text_bad_request_plain_fallback_exhausted() -> None:
# so HTML fails after 1 attempt → fallback to plain also fails after 1 attempt.
# Before the fix: 2 total. After the fix: still 2 (BadRequest SHOULD fallback).
assert call_count == 2, f"Expected 2 calls (1 HTML + 1 plain), got {call_count}"
# ---------------------------------------------------------------------------
# _markdown_to_telegram_html formatting tests
# ---------------------------------------------------------------------------
def test_markdown_to_html_headers_become_bold() -> None:
    """Level 1-3 markdown headers should each convert to a bold HTML line."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    # markdown input -> expected HTML output
    cases = {
        "# Title": "<b>Title</b>",
        "## Subtitle": "<b>Subtitle</b>",
        "### Deep": "<b>Deep</b>",
    }
    for markdown, expected in cases.items():
        assert _markdown_to_telegram_html(markdown) == expected
def test_markdown_to_html_numbered_lists_preserved() -> None:
    """Every numbered item should still be present after conversion."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    rendered = _markdown_to_telegram_html("1. First\n2. Second\n3. Third")
    for item in ("1. First", "2. Second", "3. Third"):
        assert item in rendered
def test_markdown_to_html_numbered_list_normalizes_whitespace() -> None:
    """Extra whitespace after a list number's dot collapses to one space."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    # The input must actually contain the extra spaces; with single spaces
    # the assertions below would pass without any normalization happening.
    text = "1.   Lots of space\n2.  Two spaces"
    result = _markdown_to_telegram_html(text)
    assert "1. Lots of space" in result
    assert "2. Two spaces" in result
    # No run of multiple spaces should remain after a dot.
    assert ".  " not in result
def test_markdown_to_html_headers_survive_html_escaping() -> None:
    """A header with <, & and > is escaped inside a single bold element."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    expected = "<b>A &lt; B &amp; C &gt; D</b>"
    assert _markdown_to_telegram_html("# A < B & C > D") == expected
def test_markdown_to_html_mixed_formatting() -> None:
    """A header, bullets, numbered steps and bold text convert together."""
    from nanobot.channels.telegram import _markdown_to_telegram_html

    rendered = _markdown_to_telegram_html(
        "# Overview\n\n- bullet one\n- bullet two\n\n1. step one\n2. step two\n\n**bold text**"
    )
    # One representative fragment per kind of formatting.
    expected_fragments = (
        "<b>Overview</b>",
        "\u2022 bullet one",
        "1. step one",
        "<b>bold text</b>",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
# ---------------------------------------------------------------------------
# _strip_md_block tests
# ---------------------------------------------------------------------------
def test_strip_md_block_removes_inline_formatting() -> None:
    """Bold, italic and strikethrough delimiters are removed from the text."""
    from nanobot.channels.telegram import _strip_md_block

    stripped = _strip_md_block("**bold** and _italic_ and ~~struck~~")
    assert stripped == "bold and italic and struck"
def test_strip_md_block_strips_headers() -> None:
    """The leading '#' markers are dropped while the body line is untouched."""
    from nanobot.channels.telegram import _strip_md_block

    stripped = _strip_md_block("## Title\nBody")
    assert stripped == "Title\nBody"
def test_strip_md_block_converts_bullets_and_numbers() -> None:
    """Dash bullets are expected as '•' items; numbered items keep their number."""
    from nanobot.channels.telegram import _strip_md_block

    stripped = _strip_md_block("- item a\n1. item b\n2. item c")
    for fragment in ("\u2022 item a", "1. item b", "2. item c"):
        assert fragment in stripped
def test_strip_md_block_strips_links() -> None:
    """A markdown link collapses to just its label text."""
    from nanobot.channels.telegram import _strip_md_block

    stripped = _strip_md_block("[click here](https://example.com)")
    assert stripped == "click here"
# ---------------------------------------------------------------------------
# Streaming mid-edit uses _strip_md_block
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_send_delta_mid_stream_strips_markdown() -> None:
    """Mid-stream edits should strip markdown so users see clean text."""
    import time

    config = TelegramConfig(enabled=True, token="123:abc", allow_from=["*"])
    channel = TelegramChannel(config, MessageBus())
    channel._app = _FakeApp(lambda: None)

    # Keep handles on the mocks so the recorded call arguments can be read back.
    send_mock = AsyncMock(return_value=SimpleNamespace(message_id=42))
    edit_mock = AsyncMock()
    channel._app.bot.send_message = send_mock
    channel._app.bot.edit_message_text = edit_mock

    # First delta: goes out through send_message.
    await channel.send_delta("999", "**hello** world")
    first_text = send_mock.call_args.kwargs.get("text", "")
    # Should NOT contain raw markdown asterisks
    assert "**" not in first_text
    assert "hello world" in first_text

    # Second delta: backdate the last edit so the edit interval has elapsed.
    channel._stream_bufs["999"].last_edit = time.monotonic() - 10
    await channel.send_delta("999", "\n### Title\n1. step")
    second_text = edit_mock.call_args.kwargs.get("text", "")
    for marker in ("###", "**"):
        assert marker not in second_text
    for fragment in ("Title", "1. step"):
        assert fragment in second_text