From 8a2a5eecdd6936338ca07a5a00d0e709cd83eab7 Mon Sep 17 00:00:00 2001 From: Kaloyan Tenchov Date: Sat, 16 May 2026 10:51:57 -0400 Subject: [PATCH] fix(signal): emit textStyle offsets in UTF-16 code units Signal's BodyRange (via signal-cli's textStyle) interprets start/length as UTF-16 code units, but the Phase-3 assembly used Python's len(), which counts code points. A single non-BMP character (e.g. an emoji) earlier in a message shifted every subsequent styled span left by one unit, dropping the last letter of bold/italic words. Track a running UTF-16 offset in the assembly loop and add regression tests covering emojis, supplementary CJK, ZWJ sequences, and a multi-section message that mirrors the reported failure. Co-Authored-By: Claude Opus 4.7 --- nanobot/channels/signal.py | 16 +++- tests/channels/test_signal_markdown.py | 109 +++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 3 deletions(-) diff --git a/nanobot/channels/signal.py b/nanobot/channels/signal.py index 45c520291..84bec425d 100644 --- a/nanobot/channels/signal.py +++ b/nanobot/channels/signal.py @@ -43,6 +43,11 @@ _SIG_STRIKE_RE = re.compile(r'~~(.+?)~~|(? int: + """UTF-16 code-unit length, matching Signal BodyRange semantics.""" + return len(s.encode("utf-16-le")) // 2 + + def _sig_strip_cell(s: str) -> str: """Strip inline markdown from a table cell for plain-text rendering.""" s = re.sub(r'\*\*(.+?)\*\*', r'\1', s) @@ -183,15 +188,20 @@ def _markdown_to_signal(text: str) -> tuple[str, list[str]]: # Strikethrough: ~~text~~ (standard) or ~text~ (single-tilde variant). transform(_SIG_STRIKE_RE, lambda m, s: [_Run(m.group(1) or m.group(2), s | {"STRIKETHROUGH"})]) - # Phase 3: assemble output. + # Phase 3: assemble output. Offsets and lengths are emitted in UTF-16 code + # units because Signal's BodyRange (via signal-cli's textStyle) interprets + # them as such; Python's len() counts code points, which would shift ranges + # left by 1 unit per non-BMP character preceding them. 
plain_text = "" text_styles: list[str] = [] + utf16_offset = 0 for run in runs: if not run.text: continue - start = len(plain_text) plain_text += run.text - length = len(plain_text) - start + start = utf16_offset + length = _utf16_len(run.text) + utf16_offset += length for style in sorted(run.styles): text_styles.append(f"{start}:{length}:{style}") diff --git a/tests/channels/test_signal_markdown.py b/tests/channels/test_signal_markdown.py index 15eca70ff..36b75f163 100644 --- a/tests/channels/test_signal_markdown.py +++ b/tests/channels/test_signal_markdown.py @@ -3,6 +3,10 @@ from nanobot.channels.signal import _markdown_to_signal +def _utf16_len(s: str) -> int: + return len(s.encode("utf-16-le")) // 2 + + def styles_for(plain: str, text_styles: list[str]) -> dict[str, list[str]]: """Return a dict mapping each styled substring to its style list.""" result: dict[str, list[str]] = {} @@ -14,6 +18,18 @@ def styles_for(plain: str, text_styles: list[str]) -> dict[str, list[str]]: return result +def utf16_styles_for(plain: str, text_styles: list[str]) -> dict[str, list[str]]: + """Like styles_for, but slices `plain` using UTF-16 offsets (Signal's units).""" + encoded = plain.encode("utf-16-le") + result: dict[str, list[str]] = {} + for entry in text_styles: + start_s, length_s, style = entry.split(":", 2) + start, length = int(start_s), int(length_s) + span = encoded[start * 2 : (start + length) * 2].decode("utf-16-le") + result.setdefault(span, []).append(style) + return result + + # --------------------------------------------------------------------------- # Basic cases # --------------------------------------------------------------------------- @@ -242,3 +258,96 @@ def test_style_ranges_are_within_bounds(): start, length = int(start_s), int(length_s) assert start >= 0 assert start + length <= len(plain) + + +# --------------------------------------------------------------------------- +# Non-BMP / UTF-16 offsets +# +# Signal's BodyRange (and signal-cli's 
textStyle) interprets start/length in +# UTF-16 code units. Python's len() counts code points, so characters outside +# the BMP (emojis, supplementary CJK) shift offsets by +1 per occurrence. +# --------------------------------------------------------------------------- + + +def assert_within_utf16_bounds(plain: str, styles: list[str]) -> None: + limit = _utf16_len(plain) + for entry in styles: + start_s, length_s, _ = entry.split(":", 2) + start, length = int(start_s), int(length_s) + assert start >= 0 + assert start + length <= limit, ( + f"range {entry} exceeds utf-16 length {limit} of {plain!r}" + ) + + +def test_bold_with_emoji_inside(): + plain, styles = _markdown_to_signal("**hi 🎉 bye**") + assert plain == "hi 🎉 bye" + assert utf16_styles_for(plain, styles) == {"hi 🎉 bye": ["BOLD"]} + assert_within_utf16_bounds(plain, styles) + + +def test_italic_with_trailing_emoji(): + plain, styles = _markdown_to_signal("*bye 🎉*") + assert plain == "bye 🎉" + assert utf16_styles_for(plain, styles) == {"bye 🎉": ["ITALIC"]} + assert_within_utf16_bounds(plain, styles) + + +def test_bold_after_emoji_prefix(): + plain, styles = _markdown_to_signal("🎉 **bold**") + assert plain == "🎉 bold" + assert utf16_styles_for(plain, styles) == {"bold": ["BOLD"]} + assert_within_utf16_bounds(plain, styles) + + +def test_bold_after_and_inside_emoji(): + plain, styles = _markdown_to_signal("🎉 **a 🎊 b**") + assert plain == "🎉 a 🎊 b" + assert utf16_styles_for(plain, styles) == {"a 🎊 b": ["BOLD"]} + assert_within_utf16_bounds(plain, styles) + + +def test_supplementary_cjk_in_bold(): + """Non-BMP CJK (U+20BB7) proves the bug is UTF-16, not emoji-specific.""" + plain, styles = _markdown_to_signal("**𠮷野家**") + assert plain == "𠮷野家" + assert utf16_styles_for(plain, styles) == {"𠮷野家": ["BOLD"]} + assert_within_utf16_bounds(plain, styles) + + +def test_zwj_emoji_in_bold(): + """ZWJ family sequence = multiple surrogate pairs + BMP ZWJs.""" + plain, styles = _markdown_to_signal("**hi 👨‍👩‍👧 bye**") 
+ assert plain == "hi 👨‍👩‍👧 bye" + assert utf16_styles_for(plain, styles) == {"hi 👨‍👩‍👧 bye": ["BOLD"]} + assert_within_utf16_bounds(plain, styles) + + +def test_ascii_offsets_unchanged(): + """ASCII-only path must produce the same offsets as before the UTF-16 fix.""" + plain, styles = _markdown_to_signal("**bold** plain *it*") + assert plain == "bold plain it" + assert sorted(styles) == sorted(["0:4:BOLD", "11:2:ITALIC"]) + + +def test_reported_daily_brief_pattern(): + """Regression for the reported bug: a single non-BMP emoji shifts every + subsequent styled span left by 1 UTF-16 unit, lopping off the last letter. + """ + md = ( + "**Weather**\n" + "- Conditions: 🌩️ Thunderstorms\n\n" + "**News**\n" + "*World*\n" + "*Local*\n\n" + "**Quote of the Day**" + ) + plain, styles = _markdown_to_signal(md) + sd = utf16_styles_for(plain, styles) + assert sd.get("Weather") == ["BOLD"] + assert sd.get("News") == ["BOLD"] + assert sd.get("World") == ["ITALIC"] + assert sd.get("Local") == ["ITALIC"] + assert sd.get("Quote of the Day") == ["BOLD"] + assert_within_utf16_bounds(plain, styles)