From 219c32018b574b483603ece7c31df35d6a99fce0 Mon Sep 17 00:00:00 2001 From: Cyril Guilleminot Date: Mon, 13 Apr 2026 19:32:13 +0300 Subject: [PATCH] fix: UTF-8 decoding corrupts multi-byte characters in streaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SSE reader was calling resp.read(1) and decoding each byte on its own as UTF-8 with errors="replace". Multi-byte characters (é, à, 中, emoji, etc.) span 2–4 bytes, and no single byte of such a sequence is valid UTF-8 by itself, so every one of those bytes was replaced by U+FFFD, producing garbled output for any non-ASCII language. The fix reads chunks of up to 4 KiB and feeds them through an incremental UTF-8 decoder, which holds back incomplete sequences so that characters split across chunk boundaries decode correctly; the decoder is flushed with final=True at end of stream. --- agent.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/agent.py b/agent.py index 0a97816..cb8a2c0 100755 --- a/agent.py +++ b/agent.py @@ -3,7 +3,7 @@ mac code — claude code for your Mac """ -import json, sys, os, time, subprocess, re, threading, queue +import json, sys, os, time, subprocess, re, threading, queue, codecs import urllib.request, random from datetime import datetime from pathlib import Path @@ -543,11 +543,13 @@ def stream_llm(messages): with urllib.request.urlopen(req, timeout=300) as resp: buf = "" + decoder = codecs.getincrementaldecoder("utf-8")(errors="replace") while True: - ch = resp.read(1) + ch = resp.read(4096) if not ch: + buf += decoder.decode(b"", final=True) break - buf += ch.decode("utf-8", errors="replace") + buf += decoder.decode(ch) while "\n" in buf: line, buf = buf.split("\n", 1) line = line.strip()