From dedb010dd77432531bc5bcc67854b2936a88298e Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Fri, 8 May 2026 21:04:08 +0200 Subject: [PATCH 1/2] Respect `encoding` option in `Prism.lex` and friends utf-8 is the default for source files but can be overwritten via options --- ext/prism/extension.c | 2 +- test/prism/lex_test.rb | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/ext/prism/extension.c b/ext/prism/extension.c index 9f9169cfff..27df8dac50 100644 --- a/ext/prism/extension.c +++ b/ext/prism/extension.c @@ -793,7 +793,7 @@ parse_lex_input(const uint8_t *input, size_t input_length, const pm_options_t *o parse_lex_data_t parse_lex_data = { .source = source, .tokens = rb_ary_new(), - .encoding = rb_utf8_encoding(), + .encoding = rb_enc_find(pm_parser_encoding_name(parser)), .freeze = pm_options_freeze(options), }; diff --git a/test/prism/lex_test.rb b/test/prism/lex_test.rb index 8ea7ce7e9b..1e06d52184 100644 --- a/test/prism/lex_test.rb +++ b/test/prism/lex_test.rb @@ -47,6 +47,24 @@ def test_parse_lex_file end end + def test_lex_encoding + tokens = Prism.lex('"わたし"', encoding: Encoding::Windows_31J).value + tokens.each do |t| + assert_equal(Encoding::Windows_31J, t[0].value.encoding) + end + + # Shebangs must appear on the first line. For these cases, the encoding + # comment may appear second, but it should still change encoding. + tokens = Prism.lex(<<~RUBY, encoding: Encoding::Windows_31J).value + #! /usr/bin/env ruby + # encoding: utf-8 + "わたし" + RUBY + tokens.each do |t| + assert_equal(Encoding::UTF_8, t[0].value.encoding) + end + end + if RUBY_VERSION >= "3.3" def test_lex_compat source = "foo bar" From 76a1201ccb05f381df2cc9f2183a9b5193d72e0b Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Fri, 8 May 2026 21:10:27 +0200 Subject: [PATCH 2/2] Take the strings encoding as the initial encoding in the ripper translator When no magic encoding comment is present, it does not default to utf-8, and takes the encoding of the string that contains the source code instead. Most of the time that will be utf-8, but not always. --- lib/prism/translation/ripper.rb | 5 +++-- test/prism/ruby/ripper_test.rb | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/prism/translation/ripper.rb b/lib/prism/translation/ripper.rb index ddcec997b9..f179a149a1 100644 --- a/lib/prism/translation/ripper.rb +++ b/lib/prism/translation/ripper.rb @@ -57,7 +57,8 @@ def self.parse(src, filename = "(ripper)", lineno = 1) # [[1, 13], :on_kw, "end", END ]] # def self.lex(src, filename = "-", lineno = 1, raise_errors: false) - result = Prism.lex_compat(coerce_source(src), filepath: filename, line: lineno, version: "current") + coerced = coerce_source(src) + result = Prism.lex_compat(coerced, filepath: filename, line: lineno, version: "current", encoding: coerced.encoding) if result.failure? && raise_errors raise SyntaxError, result.errors.first.message @@ -4077,7 +4078,7 @@ def visit_yield_node(node) # Lazily initialize the parse result. def result - @result ||= Prism.parse(source, partial_script: true, version: "current", freeze: true) + @result ||= Prism.parse(source, partial_script: true, version: "current", freeze: true, encoding: source.encoding) end def line_and_column_cache diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb index 05be087868..4fff630561 100644 --- a/test/prism/ruby/ripper_test.rb +++ b/test/prism/ruby/ripper_test.rb @@ -224,6 +224,12 @@ def test_tokenize assert_equal(Ripper.tokenize(source), Translation::Ripper.tokenize(source)) end + def test_encoding + source = '"わたし"'.encode(Encoding::Windows_31J) + assert_equal(Ripper.tokenize(source), Translation::Ripper.tokenize(source)) + assert_equal(Ripper.sexp(source), Translation::Ripper.sexp(source)) + end + def test_sexp_coercion string_like = Object.new def string_like.to_str