From f8a023218d8887abfb299e87e0073d6f1dd7a4f1 Mon Sep 17 00:00:00 2001 From: hussein1362 Date: Tue, 21 Apr 2026 10:44:06 +0300 Subject: [PATCH] fix(telegram): improve markdown rendering for modern LLM output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: Modern LLMs (GPT-5.4, Claude, Gemini) produce markdown-heavy responses with numbered lists, headers, and nested formatting. The Telegram channel's _markdown_to_telegram_html() converter has gaps that leave these poorly formatted: 1. Numbered lists (1. 2. 3.) have zero handling — sent as raw text 2. Headers (# Title) are stripped to plain text, losing visual hierarchy 3. Mid-stream edits send raw markdown (users see **bold** and ### headers while the response generates, before the final HTML conversion) Root Cause: _markdown_to_telegram_html() handles bullets (- *) but skips numbered lists entirely. Headers are stripped of # but not given any emphasis. The streaming path in send_delta() sends buf.text as-is during mid-stream edits (plain text, no parse_mode) — only the final _stream_end edit converts to HTML. Fix: 1. Headers now render as bold in the final HTML (using placeholder markers that survive HTML escaping, restored after all other processing) 2. Numbered lists are normalized (extra whitespace after the dot is cleaned) 3. New _strip_md_block() function strips markdown syntax for readable plain-text preview during streaming mid-edits The final _stream_end HTML conversion is unchanged — it still produces full HTML with parse_mode=HTML. Only the intermediate edits are improved. 
Tests: Added 10 new tests covering: - Headers converting to bold HTML - Numbered list preservation and whitespace normalization - Headers with HTML special characters - Mixed formatting (headers + bullets + numbers + bold) - _strip_md_block for inline formatting, headers, bullets, numbers, links - Streaming mid-edit markdown stripping (initial send + edit) --- nanobot/channels/telegram.py | 44 ++++++++- tests/channels/test_telegram_channel.py | 120 ++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 4 deletions(-) diff --git a/nanobot/channels/telegram.py b/nanobot/channels/telegram.py index ca0639bc1..6925658de 100644 --- a/nanobot/channels/telegram.py +++ b/nanobot/channels/telegram.py @@ -53,6 +53,34 @@ def _strip_md(s: str) -> str: return s.strip() +def _strip_md_block(text: str) -> str: + """Strip block-level and inline markdown for readable plain-text preview. + + Used during streaming mid-edits so users see clean text instead of raw + markdown syntax while the response is still being generated. + """ + # Code blocks -> just the code + text = re.sub(r'```[\w]*\n?([\s\S]*?)```', r'\1', text) + # Headers -> plain text + text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE) + # Blockquotes + text = re.sub(r'^>\s*(.*)$', r'\1', text, flags=re.MULTILINE) + # Bold / italic / strikethrough + text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) + text = re.sub(r'__(.+?)__', r'\1', text) + text = re.sub(r'(? text + text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text) + # Bullet lists + text = re.sub(r'^[-*]\s+', '• ', text, flags=re.MULTILINE) + # Numbered lists (normalize spacing) + text = re.sub(r'^(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE) + return text + + def _render_table_box(table_lines: list[str]) -> str: """Convert markdown pipe-table to compact aligned text for
 display."""
 
@@ -129,8 +157,8 @@ def _markdown_to_telegram_html(text: str) -> str:
 
     text = re.sub(r'`([^`]+)`', save_inline_code, text)
 
-    # 3. Headers # Title -> just the title text
-    text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE)
+    # 3. Headers: "# Title" -> ⟪B⟫Title⟪/B⟫ placeholders, restored as bold in step 13 (preserves visual hierarchy)
+    text = re.sub(r'^#{1,6}\s+(.+)$', r'⟪B⟫\1⟪/B⟫', text, flags=re.MULTILINE)
 
     # 4. Blockquotes > text -> just the text (before HTML escaping)
     text = re.sub(r'^>\s*(.*)$', r'\1', text, flags=re.MULTILINE)
@@ -154,6 +182,9 @@ def _markdown_to_telegram_html(text: str) -> str:
     # 10. Bullet lists - item -> • item
     text = re.sub(r'^[-*]\s+', '• ', text, flags=re.MULTILINE)
 
+    # 10.5. Numbered lists: "1.   item" -> "1. item" (keep the number, collapse whitespace after the dot)
+    text = re.sub(r'^(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
+
     # 11. Restore inline code with HTML tags
     for i, code in enumerate(inline_codes):
         # Escape HTML in code content
@@ -166,6 +197,9 @@ def _markdown_to_telegram_html(text: str) -> str:
         escaped = _escape_telegram_html(code)
         text = text.replace(f"\x00CB{i}\x00", f"
{escaped}
") + # 13. Restore header bold markers (inserted in step 3, after HTML escaping) + text = text.replace('⟪B⟫', '').replace('⟪/B⟫', '') + return text @@ -637,10 +671,11 @@ class TelegramChannel(BaseChannel): if message_thread_id := meta.get("message_thread_id"): thread_kwargs["message_thread_id"] = message_thread_id if buf.message_id is None: + preview = _strip_md_block(buf.text) try: sent = await self._call_with_retry( self._app.bot.send_message, - chat_id=int_chat_id, text=buf.text, + chat_id=int_chat_id, text=preview, **thread_kwargs, ) buf.message_id = sent.message_id @@ -653,11 +688,12 @@ class TelegramChannel(BaseChannel): await self._flush_stream_overflow(int_chat_id, buf, thread_kwargs) buf.last_edit = now return + preview = _strip_md_block(buf.text) try: await self._call_with_retry( self._app.bot.edit_message_text, chat_id=int_chat_id, message_id=buf.message_id, - text=buf.text, + text=preview, ) buf.last_edit = now except Exception as e: diff --git a/tests/channels/test_telegram_channel.py b/tests/channels/test_telegram_channel.py index e02ca5318..4a69d31a9 100644 --- a/tests/channels/test_telegram_channel.py +++ b/tests/channels/test_telegram_channel.py @@ -1471,3 +1471,123 @@ async def test_send_text_bad_request_plain_fallback_exhausted() -> None: # so HTML fails after 1 attempt → fallback to plain also fails after 1 attempt. # Before the fix: 2 total. After the fix: still 2 (BadRequest SHOULD fallback). 
assert call_count == 2, f"Expected 2 calls (1 HTML + 1 plain), got {call_count}" + + +# --------------------------------------------------------------------------- +# _markdown_to_telegram_html formatting tests +# --------------------------------------------------------------------------- + +def test_markdown_to_html_headers_become_bold() -> None: + from nanobot.channels.telegram import _markdown_to_telegram_html + + assert _markdown_to_telegram_html("# Title") == "Title" + assert _markdown_to_telegram_html("## Subtitle") == "Subtitle" + assert _markdown_to_telegram_html("### Deep") == "Deep" + + +def test_markdown_to_html_numbered_lists_preserved() -> None: + from nanobot.channels.telegram import _markdown_to_telegram_html + + text = "1. First\n2. Second\n3. Third" + result = _markdown_to_telegram_html(text) + assert "1. First" in result + assert "2. Second" in result + assert "3. Third" in result + + +def test_markdown_to_html_numbered_list_normalizes_whitespace() -> None: + from nanobot.channels.telegram import _markdown_to_telegram_html + + # Extra spaces after dot should be normalized + text = "1. Lots of space\n2. Two spaces" + result = _markdown_to_telegram_html(text) + assert "1. Lots of space" in result + assert "2. Two spaces" in result + + +def test_markdown_to_html_headers_survive_html_escaping() -> None: + """Headers containing special HTML chars should still render as bold.""" + from nanobot.channels.telegram import _markdown_to_telegram_html + + result = _markdown_to_telegram_html("# A < B & C > D") + assert "A < B & C > D" == result + + +def test_markdown_to_html_mixed_formatting() -> None: + """Headers, bullets, numbered lists, and bold coexist correctly.""" + from nanobot.channels.telegram import _markdown_to_telegram_html + + text = "# Overview\n\n- bullet one\n- bullet two\n\n1. step one\n2. step two\n\n**bold text**" + result = _markdown_to_telegram_html(text) + assert "Overview" in result + assert "\u2022 bullet one" in result + assert "1. 
step one" in result + assert "bold text" in result + + +# --------------------------------------------------------------------------- +# _strip_md_block tests +# --------------------------------------------------------------------------- + +def test_strip_md_block_removes_inline_formatting() -> None: + from nanobot.channels.telegram import _strip_md_block + + text = "**bold** and _italic_ and ~~struck~~" + result = _strip_md_block(text) + assert result == "bold and italic and struck" + + +def test_strip_md_block_strips_headers() -> None: + from nanobot.channels.telegram import _strip_md_block + + assert _strip_md_block("## Title\nBody") == "Title\nBody" + + +def test_strip_md_block_converts_bullets_and_numbers() -> None: + from nanobot.channels.telegram import _strip_md_block + + text = "- item a\n1. item b\n2. item c" + result = _strip_md_block(text) + assert "\u2022 item a" in result + assert "1. item b" in result + assert "2. item c" in result + + +def test_strip_md_block_strips_links() -> None: + from nanobot.channels.telegram import _strip_md_block + + assert _strip_md_block("[click here](https://example.com)") == "click here" + + +# --------------------------------------------------------------------------- +# Streaming mid-edit uses _strip_md_block +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_send_delta_mid_stream_strips_markdown() -> None: + """Mid-stream edits should strip markdown so users see clean text.""" + channel = TelegramChannel( + TelegramConfig(enabled=True, token="123:abc", allow_from=["*"]), + MessageBus(), + ) + channel._app = _FakeApp(lambda: None) + channel._app.bot.send_message = AsyncMock(return_value=SimpleNamespace(message_id=42)) + channel._app.bot.edit_message_text = AsyncMock() + + # Initial send with markdown + await channel.send_delta("999", "**hello** world") + sent_text = channel._app.bot.send_message.call_args.kwargs.get("text", "") + # Should NOT contain 
raw markdown asterisks + assert "**" not in sent_text + assert "hello world" in sent_text + + # Mid-stream edit + import time + buf = channel._stream_bufs["999"] + buf.last_edit = time.monotonic() - 10 # force edit interval + await channel.send_delta("999", "\n### Title\n1. step") + edited_text = channel._app.bot.edit_message_text.call_args.kwargs.get("text", "") + assert "###" not in edited_text + assert "**" not in edited_text + assert "Title" in edited_text + assert "1. step" in edited_text