mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-21 17:12:32 +00:00
fix(signal): emit textStyle offsets in UTF-16 code units
Signal's BodyRange (via signal-cli's textStyle) interprets start/length as UTF-16 code units, but the Phase-3 assembly used Python's len(), which counts code points. A single non-BMP character (e.g. an emoji) earlier in a message shifted every subsequent styled span left by one unit, dropping the last letter of bold/italic words. Track a running UTF-16 offset in the assembly loop and add regression tests covering emojis, supplementary CJK, ZWJ sequences, and a multi-section message that mirrors the reported failure. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
08154b4374
commit
8a2a5eecdd
@ -43,6 +43,11 @@ _SIG_STRIKE_RE = re.compile(r'~~(.+?)~~|(?<![~\w])~([^~\n]+)~(?![~\w])', re.DOTA
|
||||
_SIG_TOKEN_RE = re.compile(r'\x00C(\d+)\x00')
|
||||
|
||||
|
||||
def _utf16_len(s: str) -> int:
|
||||
"""UTF-16 code-unit length, matching Signal BodyRange semantics."""
|
||||
return len(s.encode("utf-16-le")) // 2
|
||||
|
||||
|
||||
def _sig_strip_cell(s: str) -> str:
|
||||
"""Strip inline markdown from a table cell for plain-text rendering."""
|
||||
s = re.sub(r'\*\*(.+?)\*\*', r'\1', s)
|
||||
@ -183,15 +188,20 @@ def _markdown_to_signal(text: str) -> tuple[str, list[str]]:
|
||||
# Strikethrough: ~~text~~ (standard) or ~text~ (single-tilde variant).
|
||||
transform(_SIG_STRIKE_RE, lambda m, s: [_Run(m.group(1) or m.group(2), s | {"STRIKETHROUGH"})])
|
||||
|
||||
# Phase 3: assemble output.
|
||||
# Phase 3: assemble output. Offsets and lengths are emitted in UTF-16 code
|
||||
# units because Signal's BodyRange (via signal-cli's textStyle) interprets
|
||||
# them as such; Python's len() counts code points, which would shift ranges
|
||||
# left by 1 unit per non-BMP character preceding them.
|
||||
plain_text = ""
|
||||
text_styles: list[str] = []
|
||||
utf16_offset = 0
|
||||
for run in runs:
|
||||
if not run.text:
|
||||
continue
|
||||
start = len(plain_text)
|
||||
plain_text += run.text
|
||||
length = len(plain_text) - start
|
||||
start = utf16_offset
|
||||
length = _utf16_len(run.text)
|
||||
utf16_offset += length
|
||||
for style in sorted(run.styles):
|
||||
text_styles.append(f"{start}:{length}:{style}")
|
||||
|
||||
|
||||
@ -3,6 +3,10 @@
|
||||
from nanobot.channels.signal import _markdown_to_signal
|
||||
|
||||
|
||||
def _utf16_len(s: str) -> int:
|
||||
return len(s.encode("utf-16-le")) // 2
|
||||
|
||||
|
||||
def styles_for(plain: str, text_styles: list[str]) -> dict[str, list[str]]:
|
||||
"""Return a dict mapping each styled substring to its style list."""
|
||||
result: dict[str, list[str]] = {}
|
||||
@ -14,6 +18,18 @@ def styles_for(plain: str, text_styles: list[str]) -> dict[str, list[str]]:
|
||||
return result
|
||||
|
||||
|
||||
def utf16_styles_for(plain: str, text_styles: list[str]) -> dict[str, list[str]]:
    """Map each styled substring of ``plain`` to the styles applied to it.

    Same contract as ``styles_for``, except that the ``start`` and ``length``
    of every ``start:length:STYLE`` entry are read as UTF-16 code units, which
    are the units Signal uses.
    """
    # One code unit is two bytes in UTF-16-LE, so unit offsets double into
    # byte offsets.
    utf16_bytes = plain.encode("utf-16-le")
    spans: dict[str, list[str]] = {}
    for spec in text_styles:
        raw_start, raw_length, style_name = spec.split(":", 2)
        begin = int(raw_start) * 2
        end = begin + int(raw_length) * 2
        substring = utf16_bytes[begin:end].decode("utf-16-le")
        if substring not in spans:
            spans[substring] = []
        spans[substring].append(style_name)
    return spans
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Basic cases
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -242,3 +258,96 @@ def test_style_ranges_are_within_bounds():
|
||||
start, length = int(start_s), int(length_s)
|
||||
assert start >= 0
|
||||
assert start + length <= len(plain)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Non-BMP / UTF-16 offsets
|
||||
#
|
||||
# Signal's BodyRange (and signal-cli's textStyle) interprets start/length in
|
||||
# UTF-16 code units. Python's len() counts code points, so characters outside
|
||||
# the BMP (emojis, supplementary CJK) each add one extra UTF-16 unit to every later offset.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def assert_within_utf16_bounds(plain: str, styles: list[str]) -> None:
    """Assert that every ``start:length:STYLE`` entry lies inside ``plain``.

    Bounds are checked in UTF-16 code units, not Python code points.
    """
    limit = _utf16_len(plain)
    for entry in styles:
        raw_start, raw_length, _style = entry.split(":", 2)
        start, length = int(raw_start), int(raw_length)
        assert start >= 0
        end = start + length
        assert end <= limit, (
            f"range {entry} exceeds utf-16 length {limit} of {plain!r}"
        )
|
||||
|
||||
|
||||
def test_bold_with_emoji_inside():
    """A bold span containing a non-BMP emoji covers the whole text."""
    expected = "hi 🎉 bye"
    plain, styles = _markdown_to_signal("**hi 🎉 bye**")
    assert plain == expected
    assert utf16_styles_for(plain, styles) == {expected: ["BOLD"]}
    assert_within_utf16_bounds(plain, styles)
|
||||
|
||||
|
||||
def test_italic_with_trailing_emoji():
    """An italic span ending in a non-BMP emoji keeps its final character."""
    expected = "bye 🎉"
    plain, styles = _markdown_to_signal("*bye 🎉*")
    assert plain == expected
    assert utf16_styles_for(plain, styles) == {expected: ["ITALIC"]}
    assert_within_utf16_bounds(plain, styles)
|
||||
|
||||
|
||||
def test_bold_after_emoji_prefix():
    """A non-BMP emoji before the bold span must not shift the span."""
    plain, styles = _markdown_to_signal("🎉 **bold**")
    assert plain == "🎉 bold"
    styled = utf16_styles_for(plain, styles)
    assert styled == {"bold": ["BOLD"]}
    assert_within_utf16_bounds(plain, styles)
|
||||
|
||||
|
||||
def test_bold_after_and_inside_emoji():
    """Emojis both before and inside the bold span are measured correctly."""
    plain, styles = _markdown_to_signal("🎉 **a 🎊 b**")
    assert plain == "🎉 a 🎊 b"
    styled = utf16_styles_for(plain, styles)
    assert styled == {"a 🎊 b": ["BOLD"]}
    assert_within_utf16_bounds(plain, styles)
|
||||
|
||||
|
||||
def test_supplementary_cjk_in_bold():
    """Non-BMP CJK (U+20BB7) proves the bug is UTF-16, not emoji-specific."""
    expected = "𠮷野家"
    plain, styles = _markdown_to_signal("**𠮷野家**")
    assert plain == expected
    assert utf16_styles_for(plain, styles) == {expected: ["BOLD"]}
    assert_within_utf16_bounds(plain, styles)
|
||||
|
||||
|
||||
def test_zwj_emoji_in_bold():
    """ZWJ family sequence = multiple surrogate pairs + BMP ZWJs."""
    # Spelled with escapes so the invisible U+200D joiners cannot be silently
    # dropped by an editor, a web view or copy/paste:
    # MAN, ZWJ, WOMAN, ZWJ, GIRL.
    family = "\U0001F468\u200D\U0001F469\u200D\U0001F467"
    expected = f"hi {family} bye"
    plain, styles = _markdown_to_signal(f"**{expected}**")
    assert plain == expected
    assert utf16_styles_for(plain, styles) == {expected: ["BOLD"]}
    assert_within_utf16_bounds(plain, styles)
|
||||
|
||||
|
||||
def test_ascii_offsets_unchanged():
    """ASCII-only input must yield the same offsets as before the UTF-16 fix."""
    expected_styles = ["0:4:BOLD", "11:2:ITALIC"]
    plain, styles = _markdown_to_signal("**bold** plain *it*")
    assert plain == "bold plain it"
    # Compare order-insensitively; only the set of ranges matters here.
    assert sorted(styles) == sorted(expected_styles)
|
||||
|
||||
|
||||
def test_reported_daily_brief_pattern():
    """Regression test for the reported multi-section "daily brief" message.

    A single non-BMP emoji used to shift every subsequent styled span left by
    1 UTF-16 unit, lopping off the last letter of each heading.
    """
    md = "\n".join(
        [
            "**Weather**",
            "- Conditions: 🌩️ Thunderstorms",
            "",
            "**News**",
            "*World*",
            "*Local*",
            "",
            "**Quote of the Day**",
        ]
    )
    expected = {
        "Weather": ["BOLD"],
        "News": ["BOLD"],
        "World": ["ITALIC"],
        "Local": ["ITALIC"],
        "Quote of the Day": ["BOLD"],
    }
    plain, styles = _markdown_to_signal(md)
    sd = utf16_styles_for(plain, styles)
    for span, wanted in expected.items():
        assert sd.get(span) == wanted
    assert_within_utf16_bounds(plain, styles)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user