fix(telegram): improve markdown rendering for modern LLM output

Problem:
Modern LLMs (GPT-5.4, Claude, Gemini) produce markdown-heavy responses with numbered lists, headers, and nested formatting. The Telegram channel's _markdown_to_telegram_html() converter has gaps that leave these poorly formatted:

1. Numbered lists (1. 2. 3.) have zero handling — sent as raw text
2. Headers (# Title) are stripped to plain text, losing visual hierarchy
3. Mid-stream edits send raw markdown (users see **bold** and ### headers while the response generates, before the final HTML conversion)

Root Cause:
_markdown_to_telegram_html() handles bullets (- *) but skips numbered lists entirely. Headers are stripped of # but not given any emphasis. The streaming path in send_delta() sends buf.text as-is during mid-stream edits (plain text, no parse_mode) — only the final _stream_end edit converts to HTML.

Fix:

1. Headers now render as <b>bold</b> in the final HTML (using placeholder markers that survive HTML escaping and are restored after all other processing)
2. Numbered lists are normalized (extra whitespace after the dot is collapsed to a single space)
3. New _strip_md_block() function strips markdown syntax for a readable plain-text preview during streaming mid-edits

The final _stream_end edit is still sent as full HTML with parse_mode=HTML; its code path is not touched by this change. Only the intermediate edits are improved.

Tests:
Added 10 new tests covering:

- Headers converting to bold HTML
- Numbered list preservation and whitespace normalization
- Headers with HTML special characters
- Mixed formatting (headers + bullets + numbers + bold)
- _strip_md_block for inline formatting, headers, bullets, numbers, links
- Streaming mid-edit markdown stripping (initial send + edit)
2026-06-23 03:04:06 +00:00 · 2026-04-21 10:44:06 +03:00 · 2026-04-21 10:44:06 +03:00 · f8a023218d
commit f8a023218d
parent 37ea8b8f5b
2 changed files with 160 additions and 4 deletions
--- a/nanobot/channels/telegram.py
+++ b/nanobot/channels/telegram.py
@ -53,6 +53,34 @@ def _strip_md(s: str) -> str:
    return s.strip()


+def _strip_md_block(text: str) -> str:
+    """Strip block-level and inline markdown for readable plain-text preview.
+
+    Used during streaming mid-edits so users see clean text instead of raw
+    markdown syntax while the response is still being generated.
+    """
+    # Code blocks -> just the code
+    text = re.sub(r'```[\w]*\n?([\s\S]*?)```', r'\1', text)
+    # Headers -> plain text
+    text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE)
+    # Blockquotes
+    text = re.sub(r'^>\s*(.*)$', r'\1', text, flags=re.MULTILINE)
+    # Bold / italic / strikethrough
+    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
+    text = re.sub(r'__(.+?)__', r'\1', text)
+    text = re.sub(r'(?<![a-zA-Z0-9])_([^_]+)_(?![a-zA-Z0-9])', r'\1', text)
+    text = re.sub(r'~~(.+?)~~', r'\1', text)
+    # Inline code
+    text = re.sub(r'`([^`]+)`', r'\1', text)
+    # Links [text](url) -> text
+    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
+    # Bullet lists
+    text = re.sub(r'^[-*]\s+', '• ', text, flags=re.MULTILINE)
+    # Numbered lists (normalize spacing)
+    text = re.sub(r'^(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
+    return text
+
+
 def _render_table_box(table_lines: list[str]) -> str:
    """Convert markdown pipe-table to compact aligned text for <pre> display."""

@ -129,8 +157,8 @@ def _markdown_to_telegram_html(text: str) -> str:

    text = re.sub(r'`([^`]+)`', save_inline_code, text)

-    # 3. Headers # Title -> just the title text
-    text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE)
+    # 3. Headers # Title -> <b>Title</b> (preserve visual hierarchy)
+    text = re.sub(r'^#{1,6}\s+(.+)$', r'⟪B⟫\1⟪/B⟫', text, flags=re.MULTILINE)

    # 4. Blockquotes > text -> just the text (before HTML escaping)
    text = re.sub(r'^>\s*(.*)$', r'\1', text, flags=re.MULTILINE)
@ -154,6 +182,9 @@ def _markdown_to_telegram_html(text: str) -> str:
    # 10. Bullet lists - item -> • item
    text = re.sub(r'^[-*]\s+', '• ', text, flags=re.MULTILINE)

+    # 10.5. Numbered lists  1. item -> 1. item (keep number, normalize indent)
+    text = re.sub(r'^(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
+
    # 11. Restore inline code with HTML tags
    for i, code in enumerate(inline_codes):
        # Escape HTML in code content
@ -166,6 +197,9 @@ def _markdown_to_telegram_html(text: str) -> str:
        escaped = _escape_telegram_html(code)
        text = text.replace(f"\x00CB{i}\x00", f"<pre><code>{escaped}</code></pre>")

+    # 13. Restore header bold markers (inserted in step 3, after HTML escaping)
+    text = text.replace('⟪B⟫', '<b>').replace('⟪/B⟫', '</b>')
+
    return text


@ -637,10 +671,11 @@ class TelegramChannel(BaseChannel):
        if message_thread_id := meta.get("message_thread_id"):
            thread_kwargs["message_thread_id"] = message_thread_id
        if buf.message_id is None:
+            preview = _strip_md_block(buf.text)
            try:
                sent = await self._call_with_retry(
                    self._app.bot.send_message,
-                    chat_id=int_chat_id, text=buf.text,
+                    chat_id=int_chat_id, text=preview,
                    **thread_kwargs,
                )
                buf.message_id = sent.message_id
@ -653,11 +688,12 @@ class TelegramChannel(BaseChannel):
                await self._flush_stream_overflow(int_chat_id, buf, thread_kwargs)
                buf.last_edit = now
                return
+            preview = _strip_md_block(buf.text)
            try:
                await self._call_with_retry(
                    self._app.bot.edit_message_text,
                    chat_id=int_chat_id, message_id=buf.message_id,
-                    text=buf.text,
+                    text=preview,
                )
                buf.last_edit = now
            except Exception as e:
--- a/tests/channels/test_telegram_channel.py
+++ b/tests/channels/test_telegram_channel.py
@ -1471,3 +1471,123 @@ async def test_send_text_bad_request_plain_fallback_exhausted() -> None:
    # so HTML fails after 1 attempt → fallback to plain also fails after 1 attempt.
    # Before the fix: 2 total. After the fix: still 2 (BadRequest SHOULD fallback).
    assert call_count == 2, f"Expected 2 calls (1 HTML + 1 plain), got {call_count}"
+
+
+# ---------------------------------------------------------------------------
+# _markdown_to_telegram_html formatting tests
+# ---------------------------------------------------------------------------
+
+def test_markdown_to_html_headers_become_bold() -> None:
+    from nanobot.channels.telegram import _markdown_to_telegram_html
+
+    assert _markdown_to_telegram_html("# Title") == "<b>Title</b>"
+    assert _markdown_to_telegram_html("## Subtitle") == "<b>Subtitle</b>"
+    assert _markdown_to_telegram_html("### Deep") == "<b>Deep</b>"
+
+
+def test_markdown_to_html_numbered_lists_preserved() -> None:
+    from nanobot.channels.telegram import _markdown_to_telegram_html
+
+    text = "1. First\n2. Second\n3. Third"
+    result = _markdown_to_telegram_html(text)
+    assert "1. First" in result
+    assert "2. Second" in result
+    assert "3. Third" in result
+
+
+def test_markdown_to_html_numbered_list_normalizes_whitespace() -> None:
+    from nanobot.channels.telegram import _markdown_to_telegram_html
+
+    # Extra spaces after dot should be normalized
+    text = "1.   Lots of space\n2.  Two spaces"
+    result = _markdown_to_telegram_html(text)
+    assert "1. Lots of space" in result
+    assert "2. Two spaces" in result
+
+
+def test_markdown_to_html_headers_survive_html_escaping() -> None:
+    """Headers containing special HTML chars should still render as bold."""
+    from nanobot.channels.telegram import _markdown_to_telegram_html
+
+    result = _markdown_to_telegram_html("# A < B & C > D")
+    assert "<b>A &lt; B &amp; C &gt; D</b>" == result
+
+
+def test_markdown_to_html_mixed_formatting() -> None:
+    """Headers, bullets, numbered lists, and bold coexist correctly."""
+    from nanobot.channels.telegram import _markdown_to_telegram_html
+
+    text = "# Overview\n\n- bullet one\n- bullet two\n\n1. step one\n2. step two\n\n**bold text**"
+    result = _markdown_to_telegram_html(text)
+    assert "<b>Overview</b>" in result
+    assert "\u2022 bullet one" in result
+    assert "1. step one" in result
+    assert "<b>bold text</b>" in result
+
+
+# ---------------------------------------------------------------------------
+# _strip_md_block tests
+# ---------------------------------------------------------------------------
+
+def test_strip_md_block_removes_inline_formatting() -> None:
+    from nanobot.channels.telegram import _strip_md_block
+
+    text = "**bold** and _italic_ and ~~struck~~"
+    result = _strip_md_block(text)
+    assert result == "bold and italic and struck"
+
+
+def test_strip_md_block_strips_headers() -> None:
+    from nanobot.channels.telegram import _strip_md_block
+
+    assert _strip_md_block("## Title\nBody") == "Title\nBody"
+
+
+def test_strip_md_block_converts_bullets_and_numbers() -> None:
+    from nanobot.channels.telegram import _strip_md_block
+
+    text = "- item a\n1. item b\n2. item c"
+    result = _strip_md_block(text)
+    assert "\u2022 item a" in result
+    assert "1. item b" in result
+    assert "2. item c" in result
+
+
+def test_strip_md_block_strips_links() -> None:
+    from nanobot.channels.telegram import _strip_md_block
+
+    assert _strip_md_block("[click here](https://example.com)") == "click here"
+
+
+# ---------------------------------------------------------------------------
+# Streaming mid-edit uses _strip_md_block
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_send_delta_mid_stream_strips_markdown() -> None:
+    """Mid-stream edits should strip markdown so users see clean text."""
+    channel = TelegramChannel(
+        TelegramConfig(enabled=True, token="123:abc", allow_from=["*"]),
+        MessageBus(),
+    )
+    channel._app = _FakeApp(lambda: None)
+    channel._app.bot.send_message = AsyncMock(return_value=SimpleNamespace(message_id=42))
+    channel._app.bot.edit_message_text = AsyncMock()
+
+    # Initial send with markdown
+    await channel.send_delta("999", "**hello** world")
+    sent_text = channel._app.bot.send_message.call_args.kwargs.get("text", "")
+    # Should NOT contain raw markdown asterisks
+    assert "**" not in sent_text
+    assert "hello world" in sent_text
+
+    # Mid-stream edit
+    import time
+    buf = channel._stream_bufs["999"]
+    buf.last_edit = time.monotonic() - 10  # force edit interval
+    await channel.send_delta("999", "\n### Title\n1. step")
+    edited_text = channel._app.bot.edit_message_text.call_args.kwargs.get("text", "")
+    assert "###" not in edited_text
+    assert "**" not in edited_text
+    assert "Title" in edited_text
+    assert "1. step" in edited_text