From dedb010dd77432531bc5bcc67854b2936a88298e Mon Sep 17 00:00:00 2001
From: Earlopain <14981592+Earlopain@users.noreply.github.com>
Date: Fri, 8 May 2026 21:04:08 +0200
Subject: [PATCH 1/2] Respect `encoding` option in `Prism.lex` and friends

UTF-8 is the default for source files but can be overridden via options
---
 ext/prism/extension.c  |  2 +-
 test/prism/lex_test.rb | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/ext/prism/extension.c b/ext/prism/extension.c
index 9f9169cfff..27df8dac50 100644
--- a/ext/prism/extension.c
+++ b/ext/prism/extension.c
@@ -793,7 +793,7 @@ parse_lex_input(const uint8_t *input, size_t input_length, const pm_options_t *o
     parse_lex_data_t parse_lex_data = {
         .source = source,
         .tokens = rb_ary_new(),
-        .encoding = rb_utf8_encoding(),
+        .encoding = rb_enc_find(pm_parser_encoding_name(parser)),
         .freeze = pm_options_freeze(options),
     };
 
diff --git a/test/prism/lex_test.rb b/test/prism/lex_test.rb
index 8ea7ce7e9b..1e06d52184 100644
--- a/test/prism/lex_test.rb
+++ b/test/prism/lex_test.rb
@@ -47,6 +47,24 @@ def test_parse_lex_file
       end
     end
 
+    def test_lex_encoding
+      tokens = Prism.lex('"わたし"', encoding: Encoding::Windows_31J).value
+      tokens.each do |t|
+        assert_equal(Encoding::Windows_31J, t[0].value.encoding)
+      end
+
+      # Shebangs must appear on the first line. In that case the magic encoding
+      # comment sits on the second line and must still override the option.
+      tokens = Prism.lex(<<~RUBY, encoding: Encoding::Windows_31J).value
+        #! /usr/bin/env ruby
+        # encoding: utf-8
+        "わたし"
+      RUBY
+      tokens.each do |t|
+        assert_equal(Encoding::UTF_8, t[0].value.encoding)
+      end
+    end
+
     if RUBY_VERSION >= "3.3"
       def test_lex_compat
         source = "foo bar"

From 76a1201ccb05f381df2cc9f2183a9b5193d72e0b Mon Sep 17 00:00:00 2001
From: Earlopain <14981592+Earlopain@users.noreply.github.com>
Date: Fri, 8 May 2026 21:10:27 +0200
Subject: [PATCH 2/2] Take the string's encoding as the initial encoding in the
 ripper translator

When no magic encoding comment is present, Ripper does not default to UTF-8;
it takes the encoding of the string that contains the source code instead.
Most of the time that will be UTF-8, but not always.
---
 lib/prism/translation/ripper.rb | 5 +++--
 test/prism/ruby/ripper_test.rb  | 6 ++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/lib/prism/translation/ripper.rb b/lib/prism/translation/ripper.rb
index ddcec997b9..f179a149a1 100644
--- a/lib/prism/translation/ripper.rb
+++ b/lib/prism/translation/ripper.rb
@@ -57,7 +57,8 @@ def self.parse(src, filename = "(ripper)", lineno = 1)
       #          [[1, 13], :on_kw,     "end", END      ]]
       #
       def self.lex(src, filename = "-", lineno = 1, raise_errors: false)
-        result = Prism.lex_compat(coerce_source(src), filepath: filename, line: lineno, version: "current")
+        coerced = coerce_source(src)
+        result = Prism.lex_compat(coerced, filepath: filename, line: lineno, version: "current", encoding: coerced.encoding)
 
         if result.failure? && raise_errors
           raise SyntaxError, result.errors.first.message
@@ -4077,7 +4078,7 @@ def visit_yield_node(node)
 
       # Lazily initialize the parse result.
       def result
-        @result ||= Prism.parse(source, partial_script: true, version: "current", freeze: true)
+        @result ||= Prism.parse(source, partial_script: true, version: "current", freeze: true, encoding: source.encoding)
       end
 
       def line_and_column_cache
diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb
index 05be087868..4fff630561 100644
--- a/test/prism/ruby/ripper_test.rb
+++ b/test/prism/ruby/ripper_test.rb
@@ -224,6 +224,12 @@ def test_tokenize
       assert_equal(Ripper.tokenize(source), Translation::Ripper.tokenize(source))
     end
 
+    def test_encoding
+      source = '"わたし"'.encode(Encoding::Windows_31J)
+      assert_equal(Ripper.tokenize(source), Translation::Ripper.tokenize(source))
+      assert_equal(Ripper.sexp(source), Translation::Ripper.sexp(source))
+    end
+
     def test_sexp_coercion
       string_like = Object.new
       def string_like.to_str