From 06caa59cc3b2787ade201a5ecbe3339930e85105 Mon Sep 17 00:00:00 2001 From: Matt Valentine-House Date: Thu, 7 May 2026 14:24:58 +0100 Subject: [PATCH 01/12] [ruby/mmtk] Introduce support for ractor_belonging. This is a debug mode in Ruby where an extra word is used after each object to store the address of the Ractor that owns the object, used for debug purposes only. While we're working on Ractors, we also need to be able to test with MMTk enabled, so we should introduce support for this to the MMTk binding as well. As implemented we'll default the binding options to have everything disabled and hardcoded to 0, as was always the case, but if RACTOR_CHECK_MODE is enabled, we'll build and pass a valid RubyBinding object to MMTk. https://github.com/ruby/mmtk/commit/83cb291313 --- gc/mmtk/mmtk.c | 31 ++++++++++++++++++++++++++++--- gc/mmtk/mmtk.h | 2 +- gc/mmtk/src/api.rs | 12 ++++++++---- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/gc/mmtk/mmtk.c b/gc/mmtk/mmtk.c index e4cd71925c7ae6..9b1aed4e5bd1f9 100644 --- a/gc/mmtk/mmtk.c +++ b/gc/mmtk/mmtk.c @@ -16,6 +16,22 @@ #include #endif +#ifndef VM_CHECK_MODE +# define VM_CHECK_MODE RUBY_DEBUG +#endif + +// From ractor_core.h +#ifndef RACTOR_CHECK_MODE +# define RACTOR_CHECK_MODE (VM_CHECK_MODE || RUBY_DEBUG) && (SIZEOF_UINT64_T == SIZEOF_VALUE) +#endif + +#if RACTOR_CHECK_MODE +# define RVALUE_SUFFIX_SIZE sizeof(VALUE) +void rb_ractor_setup_belonging(VALUE obj); +#else +# define RVALUE_SUFFIX_SIZE 0 +#endif + struct objspace { bool measure_gc_time; bool gc_stress; @@ -557,7 +573,11 @@ void * rb_gc_impl_objspace_alloc(void) { MMTk_Builder *builder = rb_mmtk_builder_init(); - mmtk_init_binding(builder, NULL, &ruby_upcalls); + MMTk_RubyBindingOptions binding_options = { + .ractor_check_mode = RACTOR_CHECK_MODE != 0, + .suffix_size = RVALUE_SUFFIX_SIZE, + }; + mmtk_init_binding(builder, &binding_options, &ruby_upcalls); return calloc(1, sizeof(struct objspace)); } @@ -885,7 +905,8 @@ 
rb_gc_impl_new_obj(void *objspace_ptr, void *cache_ptr, VALUE klass, VALUE flags mmtk_handle_user_collection_request(ractor_cache, false, false); } - alloc_size += sizeof(VALUE); + // Layout: [hidden size header (sizeof(VALUE))][payload (alloc_size)][suffix (RVALUE_SUFFIX_SIZE)] + alloc_size += sizeof(VALUE) + RVALUE_SUFFIX_SIZE; VALUE *alloc_obj = (VALUE *)rb_mmtk_alloc_fast_path(objspace, ractor_cache, alloc_size); if (!alloc_obj) { @@ -893,7 +914,7 @@ rb_gc_impl_new_obj(void *objspace_ptr, void *cache_ptr, VALUE klass, VALUE flags } alloc_obj++; - alloc_obj[-1] = alloc_size - sizeof(VALUE); + alloc_obj[-1] = alloc_size - sizeof(VALUE) - RVALUE_SUFFIX_SIZE; alloc_obj[0] = flags; alloc_obj[1] = klass; @@ -905,6 +926,10 @@ rb_gc_impl_new_obj(void *objspace_ptr, void *cache_ptr, VALUE klass, VALUE flags objspace->total_allocated_objects++; +#if RACTOR_CHECK_MODE + rb_ractor_setup_belonging((VALUE)alloc_obj); +#endif + return (VALUE)alloc_obj; } diff --git a/gc/mmtk/mmtk.h b/gc/mmtk/mmtk.h index ee338c87efe15e..e8f95920ddcaf5 100644 --- a/gc/mmtk/mmtk.h +++ b/gc/mmtk/mmtk.h @@ -95,7 +95,7 @@ bool mmtk_is_reachable(MMTk_ObjectReference object); MMTk_Builder *mmtk_builder_default(void); void mmtk_init_binding(MMTk_Builder *builder, - const struct MMTk_RubyBindingOptions *_binding_options, + const struct MMTk_RubyBindingOptions *binding_options, const struct MMTk_RubyUpcalls *upcalls); void mmtk_initialize_collection(MMTk_VMThread tls); diff --git a/gc/mmtk/src/api.rs b/gc/mmtk/src/api.rs index b9797f6fe2df6f..0c73cd74ebc345 100644 --- a/gc/mmtk/src/api.rs +++ b/gc/mmtk/src/api.rs @@ -181,7 +181,7 @@ pub extern "C" fn mmtk_builder_default() -> *mut MMTKBuilder { #[no_mangle] pub unsafe extern "C" fn mmtk_init_binding( builder: *mut MMTKBuilder, - _binding_options: *const RubyBindingOptions, + binding_options: *const RubyBindingOptions, upcalls: *const RubyUpcalls, ) { crate::MUTATOR_THREAD_PANIC_HANDLER @@ -191,9 +191,13 @@ pub unsafe extern "C" fn mmtk_init_binding( 
crate::set_panic_hook(); let builder: Box<MMTKBuilder> = unsafe { Box::from_raw(builder) }; - let binding_options = RubyBindingOptions { - ractor_check_mode: false, - suffix_size: 0, + let binding_options = if binding_options.is_null() { + RubyBindingOptions { + ractor_check_mode: false, + suffix_size: 0, + } + } else { + unsafe { (*binding_options).clone() } }; let mmtk_boxed = mmtk_init(&builder); let mmtk_static = Box::leak(Box::new(mmtk_boxed)); From 33744d25cfdd056552ab9a464ac250d56dfaaf2c Mon Sep 17 00:00:00 2001 From: Matt Valentine-House Date: Thu, 7 May 2026 15:19:34 +0100 Subject: [PATCH 02/12] [ruby/mmtk] Remove unnecessary null check. The only caller of this unconditionally constructs a binding options object now, so this is actually dead code https://github.com/ruby/mmtk/commit/d832004e89 --- gc/mmtk/src/api.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/gc/mmtk/src/api.rs b/gc/mmtk/src/api.rs index 0c73cd74ebc345..1519d2b6237761 100644 --- a/gc/mmtk/src/api.rs +++ b/gc/mmtk/src/api.rs @@ -191,14 +191,7 @@ pub unsafe extern "C" fn mmtk_init_binding( crate::set_panic_hook(); let builder: Box<MMTKBuilder> = unsafe { Box::from_raw(builder) }; - let binding_options = if binding_options.is_null() { - RubyBindingOptions { - ractor_check_mode: false, - suffix_size: 0, - } - } else { - unsafe { (*binding_options).clone() } - }; + let binding_options = unsafe { (*binding_options).clone() }; let mmtk_boxed = mmtk_init(&builder); let mmtk_static = Box::leak(Box::new(mmtk_boxed)); From 4a0072d5f29befde814ea0d9a83c711e1f049564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chris=20Hasi=C5=84ski?= Date: Fri, 8 May 2026 19:10:12 +0200 Subject: [PATCH 03/12] Speed up Integer#to_s with a two digit lookup table (#16719) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * numeric: emit two decimal digits per iteration in rb_fix2str Replace the digit-at-a-time loop in rb_fix2str with the standard itoa 2-digit lookup table for base 10. 
Each iteration now writes two digits using a single (u % 100, u / 100) pair, so the number of loop iterations is halved for multi-digit integers. The classic per-digit loop is kept for non-base-10 conversion. Benchmark (Apple M-series, 5M-10M ops, best of 3 runs): case base patch delta --------- ----- ----- ----- 1-digit (5) 64 ns/op 64 ns/op -0% 2-digit (42) 64 ns/op 65 ns/op +2% (noise) 3-digit (400) 66 ns/op 64 ns/op -3% 5-digit (12345) 69 ns/op 67 ns/op -3% 10-digit (1234567890) 77 ns/op 67 ns/op -13% 19-digit (2^62-1) 111 ns/op 75 ns/op -33% The crossover is at ~3 digits: below that the constant setup dominates and the benefit is within noise, above that the halved iteration count shows up linearly. Typical Rails payloads mix short IDs (1-5 digits) and longer values (timestamps, nanos, large counts), so the win is workload-dependent but strictly non-negative for real code. Correctness: 100k random fuzz across the full fixnum range plus targeted edges (0, ±1, ±99, ±100, 2^30-1, 2^62-1, etc.) all pass. make test-all shows 34694 tests, 7325860 assertions, 0 new failures (same pre-existing TestArgf#test_puts flake as on master) — test_integer.rb alone runs 38 tests / 421628 assertions of which Integer#to_s exercises the bulk, all pass. The 200-byte lookup table sits in .rodata; it is too large for a single 64- or 128-byte cache line but spans only a handful of them. No change to public API, no change to bignum conversion, no change to non-base-10 conversion paths. * bignum: emit two decimal digits per iteration in big2str_2bdigits Extend the 2-digit lookup-table itoa optimisation from rb_fix2str to the inner conversion loop used by Bignum#to_s. big2str_2bdigits has two code paths — a leading-chunk path that emits variable-length digits, and a recursive-chunk path that emits a fixed-width zero-padded block — and both gain from the halved division count. The classic per-digit loop is preserved for non-base-10 conversion. 
Moves the ruby_decimal_digit_pairs table from a file-static in numeric.c to bignum.c next to ruby_digitmap, and exposes it through internal/bignum.h so both files share the same 200-byte .rodata instance. Benchmark (Apple M-series, best of 3 runs, measures bignum-only speedup against the preceding fixnum commit): case base patch delta --------- ----- ----- ----- big_20dig 10^19+... 146 ns/op 124 ns/op -15% big_40dig 10^39+... 174 ns/op 152 ns/op -13% big_100dig 10^99+42 236 ns/op 213 ns/op -10% big_500dig 10^499+7 1119 ns/op 1086 ns/op -3% big_1000dig 10^999 3490 ns/op 3459 ns/op -1% fix_19dig 2^62-1 76 ns/op 76 ns/op 0% (unchanged path) Wins concentrate in the 20-100 digit range where big2str_2bdigits is the dominant cost. Above ~500 digits the Karatsuba divmod recursion dominates and the digit-emission saving shrinks to the noise floor. The 20-100 range is what actual Ruby code exercises (financial high-precision sums, nanosecond timestamps, large counters); crypto-size (1000+ digit) bignums are rare in to_s paths. Correctness: 100k random fixnum fuzz unchanged, 500 random bignum fuzz up to 2^256 with cross-check against sprintf("%d"), bases 2/8/16/36 round-trip, plus edge cases (0, just-above-fixnum, ±2^100, 20-digit strings near the fixnum boundary). test/ruby/test_integer.rb stays at 38 tests / 421628 assertions / 0 failures, test_bignum.rb passes 74 / 607 / 0 failures, full make test-all reports 34694 tests / 0 new failures (same TestArgf#test_puts pre-existing flake as master). * benchmark: add int_to_s yaml for Integer#to_s Reproducible benchmark for the two preceding commits. 
Covers: - 1/2/3/5/10/19-digit positive fixnums (spans the break-even point and the two large-number wins at the top) - A negative fixnum (exercises the minus-sign prepend path) - 20/40/100-digit bignums (spans the big2str_2bdigits win range) - Two string-interpolation scenarios, so reviewers can see how much of the Integer#to_s speedup reaches real code that allocates the result string too Intended to be consumed by benchmark-driver against master vs int-to-s-twodigit for A/B comparison. Matches the numbers in the commit messages of 5bfb7e02a2 and c5df6de835. --------- Co-authored-by: tomoya ishida --- benchmark/int_to_s.yml | 25 ++++++++++++ bignum.c | 86 +++++++++++++++++++++++++++++++++++++----- internal/bignum.h | 1 + numeric.c | 36 ++++++++++++++++-- 4 files changed, 135 insertions(+), 13 deletions(-) create mode 100644 benchmark/int_to_s.yml diff --git a/benchmark/int_to_s.yml b/benchmark/int_to_s.yml new file mode 100644 index 00000000000000..000dae9612ec54 --- /dev/null +++ b/benchmark/int_to_s.yml @@ -0,0 +1,25 @@ +prelude: | + # frozen_string_literal: true + N1 = 5 + N2 = 42 + N3 = 400 + N5 = 12345 + N10 = 1_234_567_890 + N19 = 4_611_686_018_427_387_903 + NEG = -1_234_567_890 + BIG20 = 10 ** 19 + 12_345_678_901_234_567 + BIG40 = 10 ** 39 + 123_456_789_012_345 + BIG100 = 10 ** 99 + 42 +benchmark: + fix_1digit: "N1.to_s" + fix_2digit: "N2.to_s" + fix_3digit: "N3.to_s" + fix_5digit: "N5.to_s" + fix_10digit: "N10.to_s" + fix_19digit: "N19.to_s" + fix_negative: "NEG.to_s" + big_20digit: "BIG20.to_s" + big_40digit: "BIG40.to_s" + big_100digit: "BIG100.to_s" + interp_id: '"id=#{N10}"' + interp_mixed: '"a=#{N2},b=#{N5},c=#{N10}"' diff --git a/bignum.c b/bignum.c index e4af035caccedd..28924b4eb9cd09 100644 --- a/bignum.c +++ b/bignum.c @@ -64,6 +64,21 @@ static const bool debug_integer_pack = ( const char ruby_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz"; +/* Two-digit decimal lookup table. Offset 2*n holds the ASCII pair for + * n in the range 0..99. 
Used by both rb_fix2str in numeric.c and + * big2str_2bdigits below to emit two base-10 digits per iteration. */ +const char ruby_decimal_digit_pairs[201] = + "00010203040506070809" + "10111213141516171819" + "20212223242526272829" + "30313233343536373839" + "40414243444546474849" + "50515253545556575859" + "60616263646566676869" + "70717273747576777879" + "80818283848586878889" + "90919293949596979899"; + #ifndef SIZEOF_BDIGIT_DBL # if SIZEOF_INT*2 <= SIZEOF_LONG_LONG # define SIZEOF_BDIGIT_DBL SIZEOF_LONG_LONG @@ -4811,11 +4826,34 @@ big2str_2bdigits(struct big2str_struct *b2s, BDIGIT *xds, size_t xn, size_t tail return; p = buf; j = sizeof(buf); - do { - BDIGIT_DBL idx = num % b2s->base; - num /= b2s->base; - p[--j] = ruby_digitmap[idx]; - } while (num); + if (b2s->base == 10) { + /* Emit two decimal digits per iteration from ruby_decimal_digit_pairs. + * See the comment on the table in bignum.c near ruby_digitmap. */ + while (num >= 100) { + BDIGIT_DBL idx = (num % 100) * 2; + num /= 100; + j -= 2; + p[j] = ruby_decimal_digit_pairs[idx]; + p[j + 1] = ruby_decimal_digit_pairs[idx + 1]; + } + if (num >= 10) { + BDIGIT_DBL idx = num * 2; + j -= 2; + p[j] = ruby_decimal_digit_pairs[idx]; + p[j + 1] = ruby_decimal_digit_pairs[idx + 1]; + } + else { + /* num is 1..9 here (0 was handled above) */ + p[--j] = (char)('0' + num); + } + } + else { + do { + BDIGIT_DBL idx = num % b2s->base; + num /= b2s->base; + p[--j] = ruby_digitmap[idx]; + } while (num); + } len = sizeof(buf) - j; big2str_alloc(b2s, len + taillen); MEMCPY(b2s->ptr, buf + j, char, len); @@ -4823,11 +4861,39 @@ big2str_2bdigits(struct big2str_struct *b2s, BDIGIT *xds, size_t xn, size_t tail else { p = b2s->ptr; j = b2s->hbase2_numdigits; - do { - BDIGIT_DBL idx = num % b2s->base; - num /= b2s->base; - p[--j] = ruby_digitmap[idx]; - } while (j); + if (b2s->base == 10) { + /* Non-beginning chunks must emit EXACTLY hbase2_numdigits, + * zero-padded on the left. 
Consume num in 2-digit groups, + * handle the odd trailing digit, then memset remaining + * positions with '0'. */ + while (num >= 100) { + BDIGIT_DBL idx = (num % 100) * 2; + num /= 100; + j -= 2; + p[j] = ruby_decimal_digit_pairs[idx]; + p[j + 1] = ruby_decimal_digit_pairs[idx + 1]; + } + if (num >= 10) { + BDIGIT_DBL idx = num * 2; + j -= 2; + p[j] = ruby_decimal_digit_pairs[idx]; + p[j + 1] = ruby_decimal_digit_pairs[idx + 1]; + } + else if (num > 0) { + p[--j] = (char)('0' + num); + } + if (j > 0) { + memset(p, '0', j); + j = 0; + } + } + else { + do { + BDIGIT_DBL idx = num % b2s->base; + num /= b2s->base; + p[--j] = ruby_digitmap[idx]; + } while (j); + } len = b2s->hbase2_numdigits; } b2s->ptr += len; diff --git a/internal/bignum.h b/internal/bignum.h index f11fbd3a4d096a..7389a17c747e15 100644 --- a/internal/bignum.h +++ b/internal/bignum.h @@ -107,6 +107,7 @@ struct RBignum { /* bignum.c */ extern const char ruby_digitmap[]; +extern const char ruby_decimal_digit_pairs[]; double rb_big_fdiv_double(VALUE x, VALUE y); VALUE rb_big_uminus(VALUE x); VALUE rb_big_hash(VALUE); diff --git a/numeric.c b/numeric.c index 40b7bfc0f8e2b4..175bd7cfa0f730 100644 --- a/numeric.c +++ b/numeric.c @@ -4040,6 +4040,11 @@ rb_int_uminus(VALUE num) } } +/* ruby_decimal_digit_pairs is defined in bignum.c and declared in + * internal/bignum.h. See there for the rationale of the 2-digit + * lookup-table itoa optimisation; both rb_fix2str here and big2str_2bdigits + * in bignum.c consume it. */ + VALUE rb_fix2str(VALUE x, int base) { @@ -4072,9 +4077,34 @@ rb_fix2str(VALUE x, int base) else { u = val; } - do { - *--b = ruby_digitmap[(int)(u % base)]; - } while (u /= base); + if (base == 10) { + /* Emit two digits per iteration from a precomputed table. The + * compiler lowers `u % 100` and `u / 100` to a single multiply + + * shift, so each iteration costs roughly one multiply, one shift, + * and two stores. 
About 2x fewer iterations than the classic + * per-digit loop for multi-digit inputs. */ + while (u >= 100) { + unsigned long idx = (u % 100) * 2; + u /= 100; + b -= 2; + b[0] = ruby_decimal_digit_pairs[idx]; + b[1] = ruby_decimal_digit_pairs[idx + 1]; + } + if (u >= 10) { + unsigned long idx = u * 2; + b -= 2; + b[0] = ruby_decimal_digit_pairs[idx]; + b[1] = ruby_decimal_digit_pairs[idx + 1]; + } + else { + *--b = (char)('0' + u); + } + } + else { + do { + *--b = ruby_digitmap[(int)(u % base)]; + } while (u /= base); + } if (neg) { *--b = '-'; } From 97aa28abab6dc65e2aa0373796546d4ebf2df717 Mon Sep 17 00:00:00 2001 From: Steven Webb Date: Sat, 9 May 2026 02:27:05 +0800 Subject: [PATCH 04/12] Fix gdb rb_ps helper (#16896) Over time the .gdbinit initializer has drifted from the codebase and the rb_ps helper no longer works. This PR fixes it. The changes that caused it to break were: * 226f37059ec5f3ea3a1417e0bab630c64dbc8ac3 renamed cfp->iseq to cfp->_iseq. * 6c24904a690eb7c4e20c3fa8c3751acc03454100 switched from storing the last_id to storing the next_id. * f7ae32ed3b5b93247f9f62a58e3dd129098d0b27 removed ID_ENTRY_SIZE. 
--- .gdbinit | 15 ++++++++------- vm_core.h | 2 +- vm_insnhelper.h | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.gdbinit b/.gdbinit index bda544c64136ff..0d585beef9eeaa 100644 --- a/.gdbinit +++ b/.gdbinit @@ -979,7 +979,7 @@ end define print_lineno set $cfp = $arg0 - set $iseq = $cfp->iseq + set $iseq = rb_get_cfp_iseq($cfp) set $pos = $cfp->pc - $iseq->body->iseq_encoded if $pos != 0 set $pos = $pos - 1 @@ -1060,7 +1060,7 @@ define print_id else set $serial = (rb_id_serial_t)$id end - if $serial && $serial <= ruby_global_symbols.last_id + if $serial && $serial < ruby_global_symbols.next_id set $idx = $serial / ID_ENTRY_UNIT set $ids = (struct RArray *)ruby_global_symbols.ids set $flags = $ids->basic.flags @@ -1083,7 +1083,7 @@ define print_id set $aryptr = $ary->as.heap.ptr set $arylen = $ary->as.heap.len end - set $result = $aryptr[($serial % ID_ENTRY_UNIT) * ID_ENTRY_SIZE + $t] + set $result = $aryptr[($serial % ID_ENTRY_UNIT) + $t] if $result != RUBY_Qnil print_string $result else @@ -1117,16 +1117,17 @@ define rb_ps_thread set $cfp = $ps_thread_th->ec->cfp set $cfpend = (rb_control_frame_t *)($ps_thread_th->ec->vm_stack + $ps_thread_th->ec->vm_stack_size)-1 while $cfp < $cfpend - if $cfp->iseq - if !((VALUE)$cfp->iseq & RUBY_IMMEDIATE_MASK) && (((imemo_ifunc << RUBY_FL_USHIFT) | RUBY_T_IMEMO)==$cfp->iseq->flags & ((RUBY_IMEMO_MASK << RUBY_FL_USHIFT) | RUBY_T_MASK)) + if $cfp->_iseq + set $iseq = rb_get_cfp_iseq($cfp) + if !((VALUE)$iseq & RUBY_IMMEDIATE_MASK) && (((imemo_ifunc << RUBY_FL_USHIFT) | RUBY_T_IMEMO)==$iseq->flags & ((RUBY_IMEMO_MASK << RUBY_FL_USHIFT) | RUBY_T_MASK)) printf "%d:ifunc ", $cfpend-$cfp set print symbol-filename on - output/a $cfp->iseq.body + output/a $iseq.body set print symbol-filename off printf "\n" else if $cfp->pc - set $location = $cfp->iseq->body->location + set $location = $iseq->body->location printf "%d:", $cfpend-$cfp print_pathobj $location.pathobj printf ":" diff --git a/vm_core.h b/vm_core.h 
index 89f80b52c75a37..1e3dcfe04f21ac 100644 --- a/vm_core.h +++ b/vm_core.h @@ -920,7 +920,7 @@ struct rb_block { typedef struct rb_control_frame_struct { const VALUE *pc; // cfp[0] VALUE *sp; // cfp[1] - const rb_iseq_t *_iseq; // cfp[2] -- use rb_cfp_iseq(cfp) to read + const rb_iseq_t *_iseq; // cfp[2] -- use CFP_ISEQ(cfp) to read VALUE self; // cfp[3] / block[0] const VALUE *ep; // cfp[4] / block[1] const void *block_code; // cfp[5] / block[2] -- iseq, ifunc, or forwarded block handler diff --git a/vm_insnhelper.h b/vm_insnhelper.h index 88c387ee152afa..2d83fb5897a376 100644 --- a/vm_insnhelper.h +++ b/vm_insnhelper.h @@ -116,7 +116,7 @@ enum vm_regan_acttype { // instruction sequence C struct // Uses cfp->_iseq directly because the interpreter always has a valid _iseq // field (it's written on exit from JIT code). Code in vm_insnhelper.c that -// may be called as a ZJIT fallback should use rb_cfp_iseq() instead. +// may be called as a ZJIT fallback should use CFP_ISEQ() instead. #define GET_ISEQ() (GET_CFP()->_iseq) /**********************************************************/ From 3a5bfb8a144cbc1d8021fd3709e007fc6f5b1261 Mon Sep 17 00:00:00 2001 From: Max Bernstein Date: Fri, 8 May 2026 16:00:32 -0400 Subject: [PATCH 05/12] ZJIT: Use Insn::for_each_operand_mut in Function::find (#16902) ZJIT: Use for_each_operand_mut in Function::find No need to repeat this matching logic manually. --- zjit/src/hir.rs | 241 +----------------------------------------------- 1 file changed, 5 insertions(+), 236 deletions(-) diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index 4d006af1ab2c96..27bf5df42741ee 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -2783,243 +2783,12 @@ impl Function { } }; } - macro_rules! find_vec { - ( $x:expr ) => { - { - $x.iter().map(|arg| find!(*arg)).collect() - } - }; - } - macro_rules! 
find_branch_edge { - ( $edge:ident ) => { - { - BranchEdge { - target: $edge.target, - args: find_vec!($edge.args), - } - } - }; - } let insn_id = find!(insn_id); - use Insn::*; - match &self.insns[insn_id.0] { - result@(Const {..} - | Param - | LoadArg {..} - | Entries {..} - | GetConstantPath {..} - | PatchPoint {..} - | PutSpecialObject {..} - | GetGlobal {..} - | SideExit {..} - | EntryPoint {..} - | LoadPC - | LoadSP - | LoadEC - | GetEP {..} - | LoadSelf - | BreakPoint - | IncrCounterPtr {..} - | IncrCounter(_)) => result.clone(), - &Snapshot { state: FrameState { iseq, insn_idx, pc, ref stack, ref locals } } => - Snapshot { - state: FrameState { - iseq, - insn_idx, - pc, - stack: find_vec!(stack), - locals: find_vec!(locals), - } - }, - &Return { val } => Return { val: find!(val) }, - &FixnumBitCheck { val, index } => FixnumBitCheck { val: find!(val), index }, - &Throw { throw_state, val, state } => Throw { throw_state, val: find!(val), state }, - &StringCopy { val, chilled, state } => StringCopy { val: find!(val), chilled, state }, - &StringIntern { val, state } => StringIntern { val: find!(val), state: find!(state) }, - &StringConcat { ref strings, state } => StringConcat { strings: find_vec!(strings), state: find!(state) }, - &StringGetbyte { string, index } => StringGetbyte { string: find!(string), index: find!(index) }, - &StringSetbyteFixnum { string, index, value } => StringSetbyteFixnum { string: find!(string), index: find!(index), value: find!(value) }, - &StringAppend { recv, other, state } => StringAppend { recv: find!(recv), other: find!(other), state: find!(state) }, - &StringAppendCodepoint { recv, other, state } => StringAppendCodepoint { recv: find!(recv), other: find!(other), state: find!(state) }, - &StringEqual { left, right } => StringEqual { left: find!(left), right: find!(right) }, - &ToRegexp { opt, ref values, state } => ToRegexp { opt, values: find_vec!(values), state }, - &Test { val } => Test { val: find!(val) }, - &IsNil { val } 
=> IsNil { val: find!(val) }, - &IsMethodCfunc { val, cd, cfunc, state } => IsMethodCfunc { val: find!(val), cd, cfunc, state }, - &IsBitEqual { left, right } => IsBitEqual { left: find!(left), right: find!(right) }, - &IsBitNotEqual { left, right } => IsBitNotEqual { left: find!(left), right: find!(right) }, - &BoxBool { val } => BoxBool { val: find!(val) }, - &BoxFixnum { val, state } => BoxFixnum { val: find!(val), state: find!(state) }, - &UnboxFixnum { val } => UnboxFixnum { val: find!(val) }, - &FixnumAref { recv, index } => FixnumAref { recv: find!(recv), index: find!(index) }, - Jump(target) => Jump(find_branch_edge!(target)), - &IfTrue { val, ref target } => IfTrue { val: find!(val), target: find_branch_edge!(target) }, - &IfFalse { val, ref target } => IfFalse { val: find!(val), target: find_branch_edge!(target) }, - &RefineType { val, new_type } => RefineType { val: find!(val), new_type }, - &HasType { val, expected } => HasType { val: find!(val), expected }, - &GuardType { val, guard_type, state } => GuardType { val: find!(val), guard_type, state }, - &GuardTypeNot { val, guard_type, state } => GuardTypeNot { val: find!(val), guard_type, state }, - &GuardBitEquals { val, expected, reason, state, recompile } => GuardBitEquals { val: find!(val), expected, reason, state, recompile }, - &GuardAnyBitSet { val, mask, mask_name, reason, state } => GuardAnyBitSet { val: find!(val), mask, mask_name, reason, state }, - &GuardNoBitsSet { val, mask, mask_name, reason, state } => GuardNoBitsSet { val: find!(val), mask, mask_name, reason, state }, - &GuardGreaterEq { left, right, reason, state } => GuardGreaterEq { left: find!(left), right: find!(right), reason, state }, - &GuardLess { left, right, state } => GuardLess { left: find!(left), right: find!(right), state }, - &IsBlockGiven { lep } => IsBlockGiven { lep: find!(lep) }, - &IsBlockParamModified { flags } => IsBlockParamModified { flags: find!(flags) }, - &GetBlockParam { level, ep_offset, state } => 
GetBlockParam { level, ep_offset, state: find!(state) }, - &FixnumAdd { left, right, state } => FixnumAdd { left: find!(left), right: find!(right), state }, - &FixnumSub { left, right, state } => FixnumSub { left: find!(left), right: find!(right), state }, - &FixnumMult { left, right, state } => FixnumMult { left: find!(left), right: find!(right), state }, - &FixnumDiv { left, right, state } => FixnumDiv { left: find!(left), right: find!(right), state }, - &FixnumMod { left, right, state } => FixnumMod { left: find!(left), right: find!(right), state }, - &FloatAdd { recv, other, state } => FloatAdd { recv: find!(recv), other: find!(other), state }, - &FloatSub { recv, other, state } => FloatSub { recv: find!(recv), other: find!(other), state }, - &FloatMul { recv, other, state } => FloatMul { recv: find!(recv), other: find!(other), state }, - &FloatDiv { recv, other, state } => FloatDiv { recv: find!(recv), other: find!(other), state }, - &FloatToInt { recv, state } => FloatToInt { recv: find!(recv), state }, - &FixnumNeq { left, right } => FixnumNeq { left: find!(left), right: find!(right) }, - &FixnumEq { left, right } => FixnumEq { left: find!(left), right: find!(right) }, - &FixnumGt { left, right } => FixnumGt { left: find!(left), right: find!(right) }, - &FixnumGe { left, right } => FixnumGe { left: find!(left), right: find!(right) }, - &FixnumLt { left, right } => FixnumLt { left: find!(left), right: find!(right) }, - &FixnumLe { left, right } => FixnumLe { left: find!(left), right: find!(right) }, - &FixnumAnd { left, right } => FixnumAnd { left: find!(left), right: find!(right) }, - &FixnumOr { left, right } => FixnumOr { left: find!(left), right: find!(right) }, - &FixnumXor { left, right } => FixnumXor { left: find!(left), right: find!(right) }, - &IntAnd { left, right } => IntAnd { left: find!(left), right: find!(right) }, - &IntOr { left, right } => IntOr { left: find!(left), right: find!(right) }, - &FixnumLShift { left, right, state } => FixnumLShift 
{ left: find!(left), right: find!(right), state }, - &FixnumRShift { left, right } => FixnumRShift { left: find!(left), right: find!(right) }, - &ObjToString { val, cd, state } => ObjToString { - val: find!(val), - cd, - state, - }, - &AnyToString { val, str, state } => AnyToString { - val: find!(val), - str: find!(str), - state, - }, - &SendDirect { recv, cd, cme, iseq, ref args, kw_bits, block, state } => SendDirect { - recv: find!(recv), - cd, - cme, - iseq, - args: find_vec!(args), - kw_bits, - block, - state, - }, - &Send { recv, cd, block, ref args, state, reason } => Send { - recv: find!(recv), - cd, - block, - args: find_vec!(args), - state, - reason, - }, - &SendForward { recv, cd, blockiseq, ref args, state, reason } => SendForward { - recv: find!(recv), - cd, - blockiseq, - args: find_vec!(args), - state, - reason, - }, - &InvokeSuper { recv, cd, blockiseq, ref args, state, reason } => InvokeSuper { - recv: find!(recv), - cd, - blockiseq, - args: find_vec!(args), - state, - reason, - }, - &InvokeSuperForward { recv, cd, blockiseq, ref args, state, reason } => InvokeSuperForward { - recv: find!(recv), - cd, - blockiseq, - args: find_vec!(args), - state, - reason, - }, - &InvokeBlock { cd, ref args, state, reason } => InvokeBlock { - cd, - args: find_vec!(args), - state, - reason, - }, - &InvokeBlockIfunc { cd, block_handler, ref args, state } => InvokeBlockIfunc { - cd, - block_handler: find!(block_handler), - args: find_vec!(args), - state: find!(state), - }, - &InvokeProc { recv, ref args, state, kw_splat } => InvokeProc { - recv: find!(recv), - args: find_vec!(args), - state: find!(state), - kw_splat, - }, - &InvokeBuiltin { bf, recv, ref args, state, leaf, return_type } => InvokeBuiltin { bf, recv: find!(recv), args: find_vec!(args), state, leaf, return_type }, - &ArrayDup { val, state } => ArrayDup { val: find!(val), state }, - &HashDup { val, state } => HashDup { val: find!(val), state }, - &HashAref { hash, key, state } => HashAref { hash: 
find!(hash), key: find!(key), state }, - &HashAset { hash, key, val, state } => HashAset { hash: find!(hash), key: find!(key), val: find!(val), state }, - &ObjectAlloc { val, state } => ObjectAlloc { val: find!(val), state }, - &ObjectAllocClass { class, state } => ObjectAllocClass { class, state: find!(state) }, - &CCall { cfunc, recv, ref args, name, owner, return_type, elidable } => CCall { cfunc, recv: find!(recv), args: find_vec!(args), name, owner, return_type, elidable }, - &CCallWithFrame { cd, cfunc, recv, ref args, cme, name, state, return_type, elidable, block } => CCallWithFrame { - cd, - cfunc, - recv: find!(recv), - args: find_vec!(args), - cme, - name, - state: find!(state), - return_type, - elidable, - block, - }, - &CCallVariadic { cfunc, recv, ref args, cme, name, state, return_type, elidable, block } => CCallVariadic { - cfunc, recv: find!(recv), args: find_vec!(args), cme, name, state, return_type, elidable, block - }, - &CheckMatch { target, pattern, flag, state } => CheckMatch { target: find!(target), pattern: find!(pattern), flag, state: find!(state) }, - &Defined { op_type, obj, pushval, v, lep_level, state } => Defined { op_type, obj, pushval, v: find!(v), lep_level, state: find!(state) }, - &DefinedIvar { self_val, pushval, id, state } => DefinedIvar { self_val: find!(self_val), pushval, id, state }, - &GetConstant { klass, id, allow_nil, state } => GetConstant { klass: find!(klass), id, allow_nil: find!(allow_nil), state }, - &NewArray { ref elements, state } => NewArray { elements: find_vec!(elements), state: find!(state) }, - &NewHash { ref elements, state } => NewHash { elements: find_vec!(elements), state: find!(state) }, - &NewRange { low, high, flag, state } => NewRange { low: find!(low), high: find!(high), flag, state: find!(state) }, - &NewRangeFixnum { low, high, flag, state } => NewRangeFixnum { low: find!(low), high: find!(high), flag, state: find!(state) }, - &ArrayAref { array, index } => ArrayAref { array: find!(array), 
index: find!(index) }, - &ArrayAset { array, index, val } => ArrayAset { array: find!(array), index: find!(index), val: find!(val) }, - &ArrayPop { array, state } => ArrayPop { array: find!(array), state: find!(state) }, - &ArrayLength { array } => ArrayLength { array: find!(array) }, - &AdjustBounds { index, length } => AdjustBounds { index: find!(index), length: find!(length) }, - &ArrayMax { ref elements, state } => ArrayMax { elements: find_vec!(elements), state: find!(state) }, - &ArrayMin { ref elements, state } => ArrayMin { elements: find_vec!(elements), state: find!(state) }, - &ArrayInclude { ref elements, target, state } => ArrayInclude { elements: find_vec!(elements), target: find!(target), state: find!(state) }, - &ArrayPackBuffer { ref elements, fmt, ref buffer, state } => ArrayPackBuffer { elements: find_vec!(elements), fmt: find!(fmt), buffer: (*buffer).map(|buffer| find!(buffer)), state: find!(state) }, - &DupArrayInclude { ary, target, state } => DupArrayInclude { ary, target: find!(target), state: find!(state) }, - &ArrayHash { ref elements, state } => ArrayHash { elements: find_vec!(elements), state }, - &SetGlobal { id, val, state } => SetGlobal { id, val: find!(val), state }, - &GetIvar { self_val, id, ic, state } => GetIvar { self_val: find!(self_val), id, ic, state }, - &LoadField { recv, id, offset, return_type } => LoadField { recv: find!(recv), id, offset, return_type }, - &StoreField { recv, id, offset, val } => StoreField { recv: find!(recv), id, offset, val: find!(val) }, - &WriteBarrier { recv, val } => WriteBarrier { recv: find!(recv), val: find!(val) }, - &SetIvar { self_val, id, ic, val, state } => SetIvar { self_val: find!(self_val), id, ic, val: find!(val), state }, - &GetClassVar { id, ic, state } => GetClassVar { id, ic, state }, - &SetClassVar { id, val, ic, state } => SetClassVar { id, val: find!(val), ic, state }, - &SetLocal { val, ep_offset, level } => SetLocal { val: find!(val), ep_offset, level }, - &GetSpecialSymbol { 
symbol_type, state } => GetSpecialSymbol { symbol_type, state }, - &GetSpecialNumber { nth, state } => GetSpecialNumber { nth, state }, - &ToArray { val, state } => ToArray { val: find!(val), state }, - &ToNewArray { val, state } => ToNewArray { val: find!(val), state }, - &ArrayExtend { left, right, state } => ArrayExtend { left: find!(left), right: find!(right), state }, - &ArrayPush { array, val, state } => ArrayPush { array: find!(array), val: find!(val), state }, - &CheckInterrupts { state } => CheckInterrupts { state }, - &IsA { val, class } => IsA { val: find!(val), class: find!(class) }, - } + let mut result = self.insns[insn_id.0].clone(); + result.for_each_operand_mut(&mut |operand: &mut InsnId| { + *operand = find!(*operand); + }); + result } /// Update DynamicSendReason for the instruction at insn_id From dc90c26a103ad62df73464cc1896edbcc90bd0c7 Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Fri, 8 May 2026 21:04:08 +0200 Subject: [PATCH 06/12] [ruby/prism] Respect `encoding` option in `Prism.lex` and friends utf-8 is the default for source files but can be overwritten via options https://github.com/ruby/prism/commit/355f451528 --- prism/extension.c | 2 +- test/prism/lex_test.rb | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/prism/extension.c b/prism/extension.c index 9f9169cfff7880..27df8dac50ddff 100644 --- a/prism/extension.c +++ b/prism/extension.c @@ -793,7 +793,7 @@ parse_lex_input(const uint8_t *input, size_t input_length, const pm_options_t *o parse_lex_data_t parse_lex_data = { .source = source, .tokens = rb_ary_new(), - .encoding = rb_utf8_encoding(), + .encoding = rb_enc_find(pm_parser_encoding_name(parser)), .freeze = pm_options_freeze(options), }; diff --git a/test/prism/lex_test.rb b/test/prism/lex_test.rb index 8ea7ce7e9b258f..1e06d52184b3c5 100644 --- a/test/prism/lex_test.rb +++ b/test/prism/lex_test.rb @@ -47,6 +47,24 @@ def test_parse_lex_file end end 
+ def test_lex_encoding + tokens = Prism.lex('"わたし"', encoding: Encoding::Windows_31J).value + tokens.each do |t| + assert_equal(Encoding::Windows_31J, t[0].value.encoding) + end + + # Shebangs must appear on the first line. For these cases, the encoding + # comment may appear second, but it should still change encoding. + tokens = Prism.lex(<<~RUBY, encoding: Encoding::Windows_31J).value + #! /usr/bin/env ruby + # encoding: utf-8 + "わたし" + RUBY + tokens.each do |t| + assert_equal(Encoding::UTF_8, t[0].value.encoding) + end + end + if RUBY_VERSION >= "3.3" def test_lex_compat source = "foo bar" From 07ae044b0dd4968b4ef6dd072cc0a2a851d79902 Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Fri, 8 May 2026 21:10:27 +0200 Subject: [PATCH 07/12] [ruby/prism] Take the strings encoding as the initial encoding in the ripper translator When no magic encoding comment is present, it does not default to utf-8, and takes the encoding of the string that contains the source code instead. Most of the time that will be utf-8, but not always. https://github.com/ruby/prism/commit/1a273db780 --- lib/prism/translation/ripper.rb | 5 +++-- test/prism/ruby/ripper_test.rb | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/prism/translation/ripper.rb b/lib/prism/translation/ripper.rb index ddcec997b94efa..f179a149a1df45 100644 --- a/lib/prism/translation/ripper.rb +++ b/lib/prism/translation/ripper.rb @@ -57,7 +57,8 @@ def self.parse(src, filename = "(ripper)", lineno = 1) # [[1, 13], :on_kw, "end", END ]] # def self.lex(src, filename = "-", lineno = 1, raise_errors: false) - result = Prism.lex_compat(coerce_source(src), filepath: filename, line: lineno, version: "current") + coerced = coerce_source(src) + result = Prism.lex_compat(coerced, filepath: filename, line: lineno, version: "current", encoding: coerced.encoding) if result.failure? 
&& raise_errors raise SyntaxError, result.errors.first.message @@ -4077,7 +4078,7 @@ def visit_yield_node(node) # Lazily initialize the parse result. def result - @result ||= Prism.parse(source, partial_script: true, version: "current", freeze: true) + @result ||= Prism.parse(source, partial_script: true, version: "current", freeze: true, encoding: source.encoding) end def line_and_column_cache diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb index 05be087868d811..4fff630561e7d6 100644 --- a/test/prism/ruby/ripper_test.rb +++ b/test/prism/ruby/ripper_test.rb @@ -224,6 +224,12 @@ def test_tokenize assert_equal(Ripper.tokenize(source), Translation::Ripper.tokenize(source)) end + def test_encoding + source = '"わたし"'.encode(Encoding::Windows_31J) + assert_equal(Ripper.tokenize(source), Translation::Ripper.tokenize(source)) + assert_equal(Ripper.sexp(source), Translation::Ripper.sexp(source)) + end + def test_sexp_coercion string_like = Object.new def string_like.to_str From 3d861274e6388f28a26496c473d605a286e6d3d2 Mon Sep 17 00:00:00 2001 From: John Hawthorn Date: Fri, 8 May 2026 10:39:48 -0700 Subject: [PATCH 08/12] Introduce RMATCH_{BEG,END,NREGS} helpers These are internal-only helpers which can be used instead of the RMATCH_REGS struct directly. RMATCH_REGS is just a pointer offset from the RMatch VALUE itself, so this should not significantly affect codegen. The motivation for this is that it's both simpler, and should move us towards being able to replace the storage for RMATCH, and to be able to store the positions embedded instead of in separate malloc memory. 
--- depend | 17 +++++++ ext/ripper/depend | 2 + internal/re.h | 19 ++++++++ re.c | 120 +++++++++++++++++++--------------------------- string.c | 59 ++++++++++------------- 5 files changed, 113 insertions(+), 104 deletions(-) diff --git a/depend b/depend index b4a7882d8b30f5..a17eb16f758660 100644 --- a/depend +++ b/depend @@ -4878,6 +4878,7 @@ enum.$(OBJEXT): {$(VPATH)}internal/core/rclass.h enum.$(OBJEXT): {$(VPATH)}internal/core/rdata.h enum.$(OBJEXT): {$(VPATH)}internal/core/rfile.h enum.$(OBJEXT): {$(VPATH)}internal/core/rhash.h +enum.$(OBJEXT): {$(VPATH)}internal/core/rmatch.h enum.$(OBJEXT): {$(VPATH)}internal/core/robject.h enum.$(OBJEXT): {$(VPATH)}internal/core/rregexp.h enum.$(OBJEXT): {$(VPATH)}internal/core/rstring.h @@ -4968,6 +4969,8 @@ enum.$(OBJEXT): {$(VPATH)}internal/xmalloc.h enum.$(OBJEXT): {$(VPATH)}missing.h enum.$(OBJEXT): {$(VPATH)}onigmo.h enum.$(OBJEXT): {$(VPATH)}oniguruma.h +enum.$(OBJEXT): {$(VPATH)}re.h +enum.$(OBJEXT): {$(VPATH)}regex.h enum.$(OBJEXT): {$(VPATH)}ruby_assert.h enum.$(OBJEXT): {$(VPATH)}shape.h enum.$(OBJEXT): {$(VPATH)}st.h @@ -8965,6 +8968,7 @@ marshal.$(OBJEXT): {$(VPATH)}internal/core/rclass.h marshal.$(OBJEXT): {$(VPATH)}internal/core/rdata.h marshal.$(OBJEXT): {$(VPATH)}internal/core/rfile.h marshal.$(OBJEXT): {$(VPATH)}internal/core/rhash.h +marshal.$(OBJEXT): {$(VPATH)}internal/core/rmatch.h marshal.$(OBJEXT): {$(VPATH)}internal/core/robject.h marshal.$(OBJEXT): {$(VPATH)}internal/core/rregexp.h marshal.$(OBJEXT): {$(VPATH)}internal/core/rstring.h @@ -9060,6 +9064,8 @@ marshal.$(OBJEXT): {$(VPATH)}missing.h marshal.$(OBJEXT): {$(VPATH)}node.h marshal.$(OBJEXT): {$(VPATH)}onigmo.h marshal.$(OBJEXT): {$(VPATH)}oniguruma.h +marshal.$(OBJEXT): {$(VPATH)}re.h +marshal.$(OBJEXT): {$(VPATH)}regex.h marshal.$(OBJEXT): {$(VPATH)}ruby_assert.h marshal.$(OBJEXT): {$(VPATH)}ruby_atomic.h marshal.$(OBJEXT): {$(VPATH)}rubyparser.h @@ -10952,6 +10958,7 @@ parse.$(OBJEXT): {$(VPATH)}internal/core/rclass.h parse.$(OBJEXT): 
{$(VPATH)}internal/core/rdata.h parse.$(OBJEXT): {$(VPATH)}internal/core/rfile.h parse.$(OBJEXT): {$(VPATH)}internal/core/rhash.h +parse.$(OBJEXT): {$(VPATH)}internal/core/rmatch.h parse.$(OBJEXT): {$(VPATH)}internal/core/robject.h parse.$(OBJEXT): {$(VPATH)}internal/core/rregexp.h parse.$(OBJEXT): {$(VPATH)}internal/core/rstring.h @@ -11054,6 +11061,7 @@ parse.$(OBJEXT): {$(VPATH)}parser_st.h parse.$(OBJEXT): {$(VPATH)}probes.dmyh parse.$(OBJEXT): {$(VPATH)}probes.h parse.$(OBJEXT): {$(VPATH)}ractor.h +parse.$(OBJEXT): {$(VPATH)}re.h parse.$(OBJEXT): {$(VPATH)}regenc.h parse.$(OBJEXT): {$(VPATH)}regex.h parse.$(OBJEXT): {$(VPATH)}ruby_assert.h @@ -15946,6 +15954,7 @@ ruby_parser.$(OBJEXT): {$(VPATH)}internal/core/rclass.h ruby_parser.$(OBJEXT): {$(VPATH)}internal/core/rdata.h ruby_parser.$(OBJEXT): {$(VPATH)}internal/core/rfile.h ruby_parser.$(OBJEXT): {$(VPATH)}internal/core/rhash.h +ruby_parser.$(OBJEXT): {$(VPATH)}internal/core/rmatch.h ruby_parser.$(OBJEXT): {$(VPATH)}internal/core/robject.h ruby_parser.$(OBJEXT): {$(VPATH)}internal/core/rregexp.h ruby_parser.$(OBJEXT): {$(VPATH)}internal/core/rstring.h @@ -16037,6 +16046,8 @@ ruby_parser.$(OBJEXT): {$(VPATH)}missing.h ruby_parser.$(OBJEXT): {$(VPATH)}node.h ruby_parser.$(OBJEXT): {$(VPATH)}onigmo.h ruby_parser.$(OBJEXT): {$(VPATH)}oniguruma.h +ruby_parser.$(OBJEXT): {$(VPATH)}re.h +ruby_parser.$(OBJEXT): {$(VPATH)}regex.h ruby_parser.$(OBJEXT): {$(VPATH)}ruby_assert.h ruby_parser.$(OBJEXT): {$(VPATH)}ruby_parser.c ruby_parser.$(OBJEXT): {$(VPATH)}rubyparser.h @@ -19345,6 +19356,7 @@ variable.$(OBJEXT): {$(VPATH)}internal/core/rclass.h variable.$(OBJEXT): {$(VPATH)}internal/core/rdata.h variable.$(OBJEXT): {$(VPATH)}internal/core/rfile.h variable.$(OBJEXT): {$(VPATH)}internal/core/rhash.h +variable.$(OBJEXT): {$(VPATH)}internal/core/rmatch.h variable.$(OBJEXT): {$(VPATH)}internal/core/robject.h variable.$(OBJEXT): {$(VPATH)}internal/core/rregexp.h variable.$(OBJEXT): {$(VPATH)}internal/core/rstring.h @@ 
-19439,6 +19451,8 @@ variable.$(OBJEXT): {$(VPATH)}onigmo.h variable.$(OBJEXT): {$(VPATH)}oniguruma.h variable.$(OBJEXT): {$(VPATH)}ractor.h variable.$(OBJEXT): {$(VPATH)}ractor_core.h +variable.$(OBJEXT): {$(VPATH)}re.h +variable.$(OBJEXT): {$(VPATH)}regex.h variable.$(OBJEXT): {$(VPATH)}ruby_assert.h variable.$(OBJEXT): {$(VPATH)}ruby_atomic.h variable.$(OBJEXT): {$(VPATH)}rubyparser.h @@ -19839,6 +19853,7 @@ vm.$(OBJEXT): {$(VPATH)}internal/core/rclass.h vm.$(OBJEXT): {$(VPATH)}internal/core/rdata.h vm.$(OBJEXT): {$(VPATH)}internal/core/rfile.h vm.$(OBJEXT): {$(VPATH)}internal/core/rhash.h +vm.$(OBJEXT): {$(VPATH)}internal/core/rmatch.h vm.$(OBJEXT): {$(VPATH)}internal/core/robject.h vm.$(OBJEXT): {$(VPATH)}internal/core/rregexp.h vm.$(OBJEXT): {$(VPATH)}internal/core/rstring.h @@ -19941,6 +19956,8 @@ vm.$(OBJEXT): {$(VPATH)}probes.h vm.$(OBJEXT): {$(VPATH)}probes_helper.h vm.$(OBJEXT): {$(VPATH)}ractor.h vm.$(OBJEXT): {$(VPATH)}ractor_core.h +vm.$(OBJEXT): {$(VPATH)}re.h +vm.$(OBJEXT): {$(VPATH)}regex.h vm.$(OBJEXT): {$(VPATH)}ruby_assert.h vm.$(OBJEXT): {$(VPATH)}ruby_atomic.h vm.$(OBJEXT): {$(VPATH)}rubyparser.h diff --git a/ext/ripper/depend b/ext/ripper/depend index 96d41c87b89ac0..db83378a1d53db 100644 --- a/ext/ripper/depend +++ b/ext/ripper/depend @@ -474,6 +474,7 @@ ripper.o: $(hdrdir)/ruby/internal/core/rclass.h ripper.o: $(hdrdir)/ruby/internal/core/rdata.h ripper.o: $(hdrdir)/ruby/internal/core/rfile.h ripper.o: $(hdrdir)/ruby/internal/core/rhash.h +ripper.o: $(hdrdir)/ruby/internal/core/rmatch.h ripper.o: $(hdrdir)/ruby/internal/core/robject.h ripper.o: $(hdrdir)/ruby/internal/core/rregexp.h ripper.o: $(hdrdir)/ruby/internal/core/rstring.h @@ -566,6 +567,7 @@ ripper.o: $(hdrdir)/ruby/missing.h ripper.o: $(hdrdir)/ruby/onigmo.h ripper.o: $(hdrdir)/ruby/oniguruma.h ripper.o: $(hdrdir)/ruby/ractor.h +ripper.o: $(hdrdir)/ruby/re.h ripper.o: $(hdrdir)/ruby/regex.h ripper.o: $(hdrdir)/ruby/ruby.h ripper.o: $(hdrdir)/ruby/st.h diff --git a/internal/re.h 
b/internal/re.h index 2d2eba0dc1905c..52a05902adaf23 100644 --- a/internal/re.h +++ b/internal/re.h @@ -10,6 +10,25 @@ */ #include "ruby/internal/stdbool.h" /* for bool */ #include "ruby/ruby.h" /* for VALUE */ +#include "ruby/re.h" /* for struct RMatch and struct re_registers */ + +static inline long +RMATCH_BEG(VALUE match, int i) +{ + return RMATCH(match)->regs.beg[i]; +} + +static inline long +RMATCH_END(VALUE match, int i) +{ + return RMATCH(match)->regs.end[i]; +} + +static inline int +RMATCH_NREGS(VALUE match) +{ + return RMATCH(match)->regs.num_regs; +} /* re.c */ VALUE rb_reg_s_alloc(VALUE klass); diff --git a/re.c b/re.c index e65369424a03a0..e4f580ecc06513 100644 --- a/re.c +++ b/re.c @@ -1179,7 +1179,7 @@ static VALUE match_size(VALUE match) { match_check(match); - return INT2FIX(RMATCH_REGS(match)->num_regs); + return INT2FIX(RMATCH_NREGS(match)); } static int name_to_backref_number(const struct re_registers *, VALUE, const char*, const char*); @@ -1193,9 +1193,9 @@ name_to_backref_error(VALUE name) } static void -backref_number_check(struct re_registers *regs, int i) +backref_number_check(VALUE match, int i) { - if (i < 0 || regs->num_regs <= i) + if (i < 0 || RMATCH_NREGS(match) <= i) rb_raise(rb_eIndexError, "index %d out of matches", i); } @@ -1245,12 +1245,11 @@ static VALUE match_offset(VALUE match, VALUE n) { int i = match_backref_number(match, n); - struct re_registers *regs = RMATCH_REGS(match); match_check(match); - backref_number_check(regs, i); + backref_number_check(match, i); - if (BEG(i) < 0) + if (RMATCH_BEG(match, i) < 0) return rb_assoc_new(Qnil, Qnil); update_char_offset(match); @@ -1280,14 +1279,13 @@ static VALUE match_byteoffset(VALUE match, VALUE n) { int i = match_backref_number(match, n); - struct re_registers *regs = RMATCH_REGS(match); match_check(match); - backref_number_check(regs, i); + backref_number_check(match, i); - if (BEG(i) < 0) + if (RMATCH_BEG(match, i) < 0) return rb_assoc_new(Qnil, Qnil); - return 
rb_assoc_new(LONG2NUM(BEG(i)), LONG2NUM(END(i))); + return rb_assoc_new(LONG2NUM(RMATCH_BEG(match, i)), LONG2NUM(RMATCH_END(match, i))); } @@ -1304,14 +1302,13 @@ static VALUE match_bytebegin(VALUE match, VALUE n) { int i = match_backref_number(match, n); - struct re_registers *regs = RMATCH_REGS(match); match_check(match); - backref_number_check(regs, i); + backref_number_check(match, i); - if (BEG(i) < 0) + if (RMATCH_BEG(match, i) < 0) return Qnil; - return LONG2NUM(BEG(i)); + return LONG2NUM(RMATCH_BEG(match, i)); } @@ -1328,14 +1325,13 @@ static VALUE match_byteend(VALUE match, VALUE n) { int i = match_backref_number(match, n); - struct re_registers *regs = RMATCH_REGS(match); match_check(match); - backref_number_check(regs, i); + backref_number_check(match, i); - if (BEG(i) < 0) + if (RMATCH_BEG(match, i) < 0) return Qnil; - return LONG2NUM(END(i)); + return LONG2NUM(RMATCH_END(match, i)); } @@ -1352,12 +1348,11 @@ static VALUE match_begin(VALUE match, VALUE n) { int i = match_backref_number(match, n); - struct re_registers *regs = RMATCH_REGS(match); match_check(match); - backref_number_check(regs, i); + backref_number_check(match, i); - if (BEG(i) < 0) + if (RMATCH_BEG(match, i) < 0) return Qnil; update_char_offset(match); @@ -1378,12 +1373,11 @@ static VALUE match_end(VALUE match, VALUE n) { int i = match_backref_number(match, n); - struct re_registers *regs = RMATCH_REGS(match); match_check(match); - backref_number_check(regs, i); + backref_number_check(match, i); - if (BEG(i) < 0) + if (RMATCH_BEG(match, i) < 0) return Qnil; update_char_offset(match); @@ -1420,11 +1414,10 @@ static VALUE match_nth(VALUE match, VALUE n) { int i = match_backref_number(match, n); - struct re_registers *regs = RMATCH_REGS(match); - backref_number_check(regs, i); + backref_number_check(match, i); - long start = BEG(i), end = END(i); + long start = RMATCH_BEG(match, i), end = RMATCH_END(match, i); if (start < 0) return Qnil; @@ -1464,12 +1457,11 @@ static VALUE 
match_nth_length(VALUE match, VALUE n) { int i = match_backref_number(match, n); - struct re_registers *regs = RMATCH_REGS(match); match_check(match); - backref_number_check(regs, i); + backref_number_check(match, i); - if (BEG(i) < 0) + if (RMATCH_BEG(match, i) < 0) return Qnil; update_char_offset(match); @@ -1495,11 +1487,8 @@ rb_match_unbusy(VALUE match) int rb_match_count(VALUE match) { - struct re_registers *regs; if (NIL_P(match)) return -1; - regs = RMATCH_REGS(match); - if (!regs) return -1; - return regs->num_regs; + return RMATCH_NREGS(match); } static void @@ -1892,18 +1881,17 @@ rb_reg_start_with_p(VALUE re, VALUE str) VALUE rb_reg_nth_defined(int nth, VALUE match) { - struct re_registers *regs; if (NIL_P(match)) return Qnil; match_check(match); - regs = RMATCH_REGS(match); - if (nth >= regs->num_regs) { + int num_regs = RMATCH_NREGS(match); + if (nth >= num_regs) { return Qnil; } if (nth < 0) { - nth += regs->num_regs; + nth += num_regs; if (nth <= 0) return Qnil; } - return RBOOL(BEG(nth) != -1); + return RBOOL(RMATCH_BEG(match, nth) != -1); } VALUE @@ -1911,21 +1899,20 @@ rb_reg_nth_match(int nth, VALUE match) { VALUE str; long start, end, len; - struct re_registers *regs; if (NIL_P(match)) return Qnil; match_check(match); - regs = RMATCH_REGS(match); - if (nth >= regs->num_regs) { + int num_regs = RMATCH_NREGS(match); + if (nth >= num_regs) { return Qnil; } if (nth < 0) { - nth += regs->num_regs; + nth += num_regs; if (nth <= 0) return Qnil; } - start = BEG(nth); + start = RMATCH_BEG(match, nth); if (start == -1) return Qnil; - end = END(nth); + end = RMATCH_END(match, nth); len = end - start; str = rb_str_subseq(RMATCH(match)->str, start, len); return str; @@ -1959,13 +1946,11 @@ VALUE rb_reg_match_pre(VALUE match) { VALUE str; - struct re_registers *regs; if (NIL_P(match)) return Qnil; match_check(match); - regs = RMATCH_REGS(match); - if (BEG(0) == -1) return Qnil; - str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0)); + if (RMATCH_BEG(match, 0) 
== -1) return Qnil; + str = rb_str_subseq(RMATCH(match)->str, 0, RMATCH_BEG(match, 0)); return str; } @@ -1993,14 +1978,12 @@ rb_reg_match_post(VALUE match) { VALUE str; long pos; - struct re_registers *regs; if (NIL_P(match)) return Qnil; match_check(match); - regs = RMATCH_REGS(match); - if (BEG(0) == -1) return Qnil; + if (RMATCH_BEG(match, 0) == -1) return Qnil; str = RMATCH(match)->str; - pos = END(0); + pos = RMATCH_END(match, 0); str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos); return str; } @@ -2009,14 +1992,12 @@ static int match_last_index(VALUE match) { int i; - struct re_registers *regs; if (NIL_P(match)) return -1; match_check(match); - regs = RMATCH_REGS(match); - if (BEG(0) == -1) return -1; + if (RMATCH_BEG(match, 0) == -1) return -1; - for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--) + for (i = RMATCH_NREGS(match) - 1; RMATCH_BEG(match, i) == -1 && i > 0; i--) ; return i; } @@ -2026,8 +2007,8 @@ rb_reg_match_last(VALUE match) { int i = match_last_index(match); if (i <= 0) return Qnil; - struct re_registers *regs = RMATCH_REGS(match); - return rb_str_subseq(RMATCH(match)->str, BEG(i), END(i) - BEG(i)); + long start = RMATCH_BEG(match, i); + return rb_str_subseq(RMATCH(match)->str, start, RMATCH_END(match, i) - start); } VALUE @@ -2065,22 +2046,22 @@ last_paren_match_getter(ID _x, VALUE *_y) static VALUE match_array(VALUE match, int start) { - struct re_registers *regs; VALUE ary; VALUE target; int i; match_check(match); - regs = RMATCH_REGS(match); - ary = rb_ary_new2(regs->num_regs); + int num_regs = RMATCH_NREGS(match); + ary = rb_ary_new2(num_regs); target = RMATCH(match)->str; - for (i=start; inum_regs; i++) { - if (regs->beg[i] == -1) { + for (i = start; i < num_regs; i++) { + long beg = RMATCH_BEG(match, i); + if (beg == -1) { rb_ary_push(ary, Qnil); } else { - VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]); + VALUE str = rb_str_subseq(target, beg, RMATCH_END(match, i) - beg); rb_ary_push(ary, str); } } @@ 
-2165,7 +2146,7 @@ namev_to_backref_number(const struct re_registers *regs, VALUE re, VALUE name) static VALUE match_ary_subseq(VALUE match, long beg, long len, VALUE result) { - long olen = RMATCH_REGS(match)->num_regs; + long olen = RMATCH_NREGS(match); long j, end = olen < beg+len ? olen : beg+len; if (NIL_P(result)) result = rb_ary_new_capa(len); if (len == 0) return result; @@ -2183,7 +2164,7 @@ static VALUE match_ary_aref(VALUE match, VALUE idx, VALUE result) { long beg, len; - int num_regs = RMATCH_REGS(match)->num_regs; + int num_regs = RMATCH_NREGS(match); /* check if idx is Range */ switch (rb_range_beg_len(idx, &beg, &len, (long)num_regs, !NIL_P(result))) { @@ -2261,7 +2242,7 @@ match_aref(int argc, VALUE *argv, VALUE match) else { long beg = NUM2LONG(idx); long len = NUM2LONG(length); - long num_regs = RMATCH_REGS(match)->num_regs; + long num_regs = RMATCH_NREGS(match); if (len < 0) { return Qnil; } @@ -2601,8 +2582,7 @@ match_inspect(VALUE match) VALUE cname = rb_class_path(rb_obj_class(match)); VALUE str; int i; - struct re_registers *regs = RMATCH_REGS(match); - int num_regs = regs->num_regs; + int num_regs = RMATCH_NREGS(match); struct backref_name_tag *names; VALUE names_obj = Qnil; VALUE regexp = RMATCH(match)->regexp; diff --git a/string.c b/string.c index dae7700887c4c4..f179b816e8f52d 100644 --- a/string.c +++ b/string.c @@ -4620,8 +4620,7 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) if (rb_reg_search(sub, str, pos, 0) >= 0) { VALUE match = rb_backref_get(); - struct re_registers *regs = RMATCH_REGS(match); - pos = rb_str_sublen(str, BEG(0)); + pos = rb_str_sublen(str, RMATCH_BEG(match, 0)); return LONG2NUM(pos); } } @@ -4747,8 +4746,7 @@ rb_str_byteindex_m(int argc, VALUE *argv, VALUE str) if (RB_TYPE_P(sub, T_REGEXP)) { if (rb_reg_search(sub, str, pos, 0) >= 0) { VALUE match = rb_backref_get(); - struct re_registers *regs = RMATCH_REGS(match); - pos = BEG(0); + pos = RMATCH_BEG(match, 0); return LONG2NUM(pos); } } @@ -4879,8 +4877,7 @@ 
rb_str_rindex_m(int argc, VALUE *argv, VALUE str) if (rb_reg_search(sub, str, pos, 1) >= 0) { VALUE match = rb_backref_get(); - struct re_registers *regs = RMATCH_REGS(match); - pos = rb_str_sublen(str, BEG(0)); + pos = rb_str_sublen(str, RMATCH_BEG(match, 0)); return LONG2NUM(pos); } } @@ -5037,8 +5034,7 @@ rb_str_byterindex_m(int argc, VALUE *argv, VALUE str) if (RB_TYPE_P(sub, T_REGEXP)) { if (rb_reg_search(sub, str, pos, 1) >= 0) { VALUE match = rb_backref_get(); - struct re_registers *regs = RMATCH_REGS(match); - pos = BEG(0); + pos = RMATCH_BEG(match, 0); return LONG2NUM(pos); } } @@ -5915,26 +5911,25 @@ rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val) VALUE match; long start, end, len; rb_encoding *enc; - struct re_registers *regs; if (rb_reg_search(re, str, 0, 0) < 0) { rb_raise(rb_eIndexError, "regexp not matched"); } match = rb_backref_get(); nth = rb_reg_backref_number(match, backref); - regs = RMATCH_REGS(match); - if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) { + int num_regs = RMATCH_NREGS(match); + if ((nth >= num_regs) || ((nth < 0) && (-nth >= num_regs))) { rb_raise(rb_eIndexError, "index %d out of regexp", nth); } if (nth < 0) { - nth += regs->num_regs; + nth += num_regs; } - start = BEG(nth); + start = RMATCH_BEG(match, nth); if (start == -1) { rb_raise(rb_eIndexError, "regexp group %d not matched", nth); } - end = END(nth); + end = RMATCH_END(match, nth); len = end - start; StringValue(val); enc = rb_enc_check_str(str, val); @@ -6069,14 +6064,14 @@ rb_str_slice_bang(int argc, VALUE *argv, VALUE str) if (RB_TYPE_P(indx, T_REGEXP)) { if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil; VALUE match = rb_backref_get(); - struct re_registers *regs = RMATCH_REGS(match); + int num_regs = RMATCH_NREGS(match); int nth = 0; if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) { - if ((nth += regs->num_regs) <= 0) return Qnil; + if ((nth += num_regs) <= 0) return Qnil; } - else if (nth >= 
regs->num_regs) return Qnil; - beg = BEG(nth); - len = END(nth) - beg; + else if (nth >= num_regs) return Qnil; + beg = RMATCH_BEG(match, nth); + len = RMATCH_END(match, nth) - beg; goto subseq; } else if (argc == 2) { @@ -9337,18 +9332,16 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) if (result) result = rb_ary_new(); long len = RSTRING_LEN(str); long start = beg; - long idx; + int idx; int last_null = 0; - struct re_registers *regs; VALUE match = 0; for (; rb_reg_search(spat, str, start, 0) >= 0; (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) { match = rb_backref_get(); if (!result) rb_match_busy(match); - regs = RMATCH_REGS(match); - end = BEG(0); - if (start == end && BEG(0) == END(0)) { + end = RMATCH_BEG(match, 0); + if (start == end && RMATCH_BEG(match, 0) == RMATCH_END(match, 0)) { if (!ptr) { SPLIT_STR(0, 0); break; @@ -9368,13 +9361,13 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } else { SPLIT_STR(beg, end-beg); - beg = start = END(0); + beg = start = RMATCH_END(match, 0); } last_null = 0; - for (idx=1; idx < regs->num_regs; idx++) { - if (BEG(idx) == -1) continue; - SPLIT_STR(BEG(idx), END(idx)-BEG(idx)); + for (idx = 1; idx < RMATCH_NREGS(match); idx++) { + if (RMATCH_BEG(match, idx) == -1) continue; + SPLIT_STR(RMATCH_BEG(match, idx), RMATCH_END(match, idx) - RMATCH_BEG(match, idx)); } if (!NIL_P(limit) && lim <= ++i) break; } @@ -11255,10 +11248,9 @@ rb_str_partition(VALUE str, VALUE sep) goto failed; } VALUE match = rb_backref_get(); - struct re_registers *regs = RMATCH_REGS(match); - pos = BEG(0); - sep = rb_str_subseq(str, pos, END(0) - pos); + pos = RMATCH_BEG(match, 0); + sep = rb_str_subseq(str, pos, RMATCH_END(match, 0) - pos); } else { pos = rb_str_index(str, sep, 0); @@ -11292,10 +11284,9 @@ rb_str_rpartition(VALUE str, VALUE sep) goto failed; } VALUE match = rb_backref_get(); - struct re_registers *regs = RMATCH_REGS(match); - pos = BEG(0); - sep = rb_str_subseq(str, pos, END(0) - pos); + pos = 
RMATCH_BEG(match, 0); + sep = rb_str_subseq(str, pos, RMATCH_END(match, 0) - pos); } else { pos = rb_str_sublen(str, pos); From ee19cef31e3c2e38056778103a7f878afe8d99bf Mon Sep 17 00:00:00 2001 From: John Hawthorn Date: Fri, 8 May 2026 12:38:55 -0700 Subject: [PATCH 09/12] Replace BEG/END with RMATCH_BEG/RMATCH_END --- re.c | 28 +++++++++++++--------------- string.c | 31 +++++++++++++------------------ 2 files changed, 26 insertions(+), 33 deletions(-) diff --git a/re.c b/re.c index e4f580ecc06513..de46c0e7ca85b7 100644 --- a/re.c +++ b/re.c @@ -1004,7 +1004,6 @@ static void update_char_offset(VALUE match) { struct RMatch *rm = RMATCH(match); - struct re_registers *regs; int i, num_regs, num_pos; long c; char *s, *p, *q; @@ -1015,8 +1014,7 @@ update_char_offset(VALUE match) if (rm->char_offset_num_allocated) return; - regs = &rm->regs; - num_regs = rm->regs.num_regs; + num_regs = RMATCH_NREGS(match); if (rm->char_offset_num_allocated < num_regs) { SIZED_REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs, rm->char_offset_num_allocated); @@ -1026,8 +1024,8 @@ update_char_offset(VALUE match) enc = rb_enc_get(RMATCH(match)->str); if (rb_enc_mbmaxlen(enc) == 1) { for (i = 0; i < num_regs; i++) { - rm->char_offset[i].beg = BEG(i); - rm->char_offset[i].end = END(i); + rm->char_offset[i].beg = RMATCH_BEG(match, i); + rm->char_offset[i].end = RMATCH_END(match, i); } return; } @@ -1035,10 +1033,10 @@ update_char_offset(VALUE match) pairs = RB_ALLOCV_N(pair_t, pairs_obj, num_regs * 2); num_pos = 0; for (i = 0; i < num_regs; i++) { - if (BEG(i) < 0) + if (RMATCH_BEG(match, i) < 0) continue; - pairs[num_pos++].byte_pos = BEG(i); - pairs[num_pos++].byte_pos = END(i); + pairs[num_pos++].byte_pos = RMATCH_BEG(match, i); + pairs[num_pos++].byte_pos = RMATCH_END(match, i); } qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp); @@ -1053,17 +1051,17 @@ update_char_offset(VALUE match) for (i = 0; i < num_regs; i++) { pair_t key, *found; - if (BEG(i) < 0) { + if 
(RMATCH_BEG(match, i) < 0) { rm->char_offset[i].beg = -1; rm->char_offset[i].end = -1; continue; } - key.byte_pos = BEG(i); + key.byte_pos = RMATCH_BEG(match, i); found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); rm->char_offset[i].beg = found->char_pos; - key.byte_pos = END(i); + key.byte_pos = RMATCH_END(match, i); found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); rm->char_offset[i].end = found->char_pos; } @@ -3637,7 +3635,7 @@ match_integer_at(int argc, VALUE *argv, VALUE match) int base = 10; VALUE idx; - long nth; + int nth; argc = rb_check_arity(argc, 1, 2); if (FIXNUM_P(idx = argv[0])) { @@ -3651,10 +3649,10 @@ match_integer_at(int argc, VALUE *argv, VALUE match) rb_raise(rb_eArgError, "invalid radix %d", base); } - if (nth >= regs->num_regs) return Qnil; - if (nth < 0 && (nth += regs->num_regs) <= 0) return Qnil; + if (nth >= RMATCH_NREGS(match)) return Qnil; + if (nth < 0 && (nth += RMATCH_NREGS(match)) <= 0) return Qnil; - long start = BEG(nth), end = END(nth); + long start = RMATCH_BEG(match, nth), end = RMATCH_END(match, nth); if (start < 0) return Qnil; RUBY_ASSERT(start <= end, "%ld > %ld", start, end); diff --git a/string.c b/string.c index f179b816e8f52d..2d7bd4ee74401c 100644 --- a/string.c +++ b/string.c @@ -60,9 +60,6 @@ # define HAVE_CRYPT_R 1 #endif -#define BEG(no) (regs->beg[(no)]) -#define END(no) (regs->end[(no)]) - #undef rb_str_new #undef rb_usascii_str_new #undef rb_utf8_str_new @@ -6273,8 +6270,8 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) match0 = pat; } else { - beg0 = BEG(0); - end0 = END(0); + beg0 = RMATCH_BEG(match, 0); + end0 = RMATCH_END(match, 0); if (iter) match0 = rb_reg_nth_match(0, match); } @@ -6419,8 +6416,8 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) match0 = pat; } else { - beg0 = BEG(0); - end0 = END(0); + beg0 = RMATCH_BEG(match, 0); + end0 = RMATCH_END(match, 0); if (mode == ITER) match0 = rb_reg_nth_match(0, match); } @@ -10645,17 +10642,14 @@ 
scan_once(VALUE str, VALUE pat, long *start, int set_backref_str) VALUE result = Qnil; long end, pos = rb_pat_search(pat, str, *start, set_backref_str); if (pos >= 0) { - VALUE match; - struct re_registers *regs; + VALUE match = Qnil; if (BUILTIN_TYPE(pat) == T_STRING) { - regs = NULL; end = pos + RSTRING_LEN(pat); } else { match = rb_backref_get(); - regs = RMATCH_REGS(match); - pos = BEG(0); - end = END(0); + pos = RMATCH_BEG(match, 0); + end = RMATCH_END(match, 0); } if (pos == end) { @@ -10673,16 +10667,17 @@ scan_once(VALUE str, VALUE pat, long *start, int set_backref_str) *start = end; } - if (!regs || regs->num_regs == 1) { + if (NIL_P(match) || RMATCH_NREGS(match) == 1) { result = rb_str_subseq(str, pos, end - pos); return result; } else { - result = rb_ary_new2(regs->num_regs); - for (int i = 1; i < regs->num_regs; i++) { + int num_regs = RMATCH_NREGS(match); + result = rb_ary_new2(num_regs); + for (int i = 1; i < num_regs; i++) { VALUE s = Qnil; - if (BEG(i) >= 0) { - s = rb_str_subseq(str, BEG(i), END(i)-BEG(i)); + if (RMATCH_BEG(match, i) >= 0) { + s = rb_str_subseq(str, RMATCH_BEG(match, i), RMATCH_END(match, i) - RMATCH_BEG(match, i)); } rb_ary_push(result, s); From 12bb8955263bb8cb82bdcd642556ab360cbb7b12 Mon Sep 17 00:00:00 2001 From: John Hawthorn Date: Fri, 8 May 2026 12:46:41 -0700 Subject: [PATCH 10/12] Introduce RMATCH_{BEG,END}_PTR --- internal/re.h | 16 ++++++++++++++-- re.c | 20 ++++++++------------ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/internal/re.h b/internal/re.h index 52a05902adaf23..aa1c93f64275dd 100644 --- a/internal/re.h +++ b/internal/re.h @@ -12,16 +12,28 @@ #include "ruby/ruby.h" /* for VALUE */ #include "ruby/re.h" /* for struct RMatch and struct re_registers */ +static inline OnigPosition * +RMATCH_BEG_PTR(VALUE match) +{ + return RMATCH(match)->regs.beg; +} + +static inline OnigPosition * +RMATCH_END_PTR(VALUE match) +{ + return RMATCH(match)->regs.end; +} + static inline long RMATCH_BEG(VALUE match, 
int i) { - return RMATCH(match)->regs.beg[i]; + return RMATCH_BEG_PTR(match)[i]; } static inline long RMATCH_END(VALUE match, int i) { - return RMATCH(match)->regs.end[i]; + return RMATCH_END_PTR(match)[i]; } static inline int diff --git a/re.c b/re.c index de46c0e7ca85b7..bb6af74eb57f36 100644 --- a/re.c +++ b/re.c @@ -3553,16 +3553,15 @@ rb_reg_equal(VALUE re1, VALUE re2) static VALUE match_hash(VALUE match) { - const struct re_registers *regs; st_index_t hashval; match_check(match); hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str)); hashval = rb_hash_uint(hashval, reg_hash(match_regexp(match))); - regs = RMATCH_REGS(match); - hashval = rb_hash_uint(hashval, regs->num_regs); - hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg))); - hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end))); + int num_regs = RMATCH_NREGS(match); + hashval = rb_hash_uint(hashval, num_regs); + hashval = rb_hash_uint(hashval, rb_memhash(RMATCH_BEG_PTR(match), num_regs * sizeof(OnigPosition))); + hashval = rb_hash_uint(hashval, rb_memhash(RMATCH_END_PTR(match), num_regs * sizeof(OnigPosition))); hashval = rb_hash_end(hashval); return ST2FIX(hashval); } @@ -3579,18 +3578,15 @@ match_hash(VALUE match) static VALUE match_equal(VALUE match1, VALUE match2) { - const struct re_registers *regs1, *regs2; - if (match1 == match2) return Qtrue; if (!RB_TYPE_P(match2, T_MATCH)) return Qfalse; if (!RMATCH(match1)->regexp || !RMATCH(match2)->regexp) return Qfalse; if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse; if (!rb_reg_equal(match_regexp(match1), match_regexp(match2))) return Qfalse; - regs1 = RMATCH_REGS(match1); - regs2 = RMATCH_REGS(match2); - if (regs1->num_regs != regs2->num_regs) return Qfalse; - if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse; - if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse; + int 
num_regs = RMATCH_NREGS(match1); + if (num_regs != RMATCH_NREGS(match2)) return Qfalse; + if (memcmp(RMATCH_BEG_PTR(match1), RMATCH_BEG_PTR(match2), num_regs * sizeof(OnigPosition))) return Qfalse; + if (memcmp(RMATCH_END_PTR(match1), RMATCH_END_PTR(match2), num_regs * sizeof(OnigPosition))) return Qfalse; return Qtrue; } From e9e4647e6667743e26db037beaa6a56bc8c70f48 Mon Sep 17 00:00:00 2001 From: Aaron Patterson Date: Fri, 8 May 2026 14:23:18 -0700 Subject: [PATCH 11/12] ZJIT: add an unreachable instruction (#16901) Unreachable instructions terminate blocks. We'll use this mostly for testing as a terminator instruction (since traditional BB's will require all blocks to end with a terminator) --- zjit/src/asm/arm64/inst/mod.rs | 2 ++ zjit/src/asm/arm64/inst/udf.rs | 52 ++++++++++++++++++++++++++++++++++ zjit/src/asm/arm64/mod.rs | 6 ++++ zjit/src/backend/arm64/mod.rs | 3 ++ zjit/src/backend/lir.rs | 13 +++++++-- zjit/src/backend/x86_64/mod.rs | 1 + zjit/src/codegen.rs | 1 + zjit/src/hir.rs | 16 +++++++---- 8 files changed, 87 insertions(+), 7 deletions(-) create mode 100644 zjit/src/asm/arm64/inst/udf.rs diff --git a/zjit/src/asm/arm64/inst/mod.rs b/zjit/src/asm/arm64/inst/mod.rs index bfffd914efe29a..270c784f270410 100644 --- a/zjit/src/asm/arm64/inst/mod.rs +++ b/zjit/src/asm/arm64/inst/mod.rs @@ -26,6 +26,7 @@ mod sbfm; mod shift_imm; mod sys_reg; mod test_bit; +mod udf; pub use atomic::Atomic; pub use branch::Branch; @@ -52,3 +53,4 @@ pub use sbfm::SBFM; pub use shift_imm::ShiftImm; pub use sys_reg::SysReg; pub use test_bit::TestBit; +pub use udf::Udf; diff --git a/zjit/src/asm/arm64/inst/udf.rs b/zjit/src/asm/arm64/inst/udf.rs new file mode 100644 index 00000000000000..297d17ed628adf --- /dev/null +++ b/zjit/src/asm/arm64/inst/udf.rs @@ -0,0 +1,52 @@ +/// The struct that represents an A64 permanently undefined instruction. 
+/// +/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+ +/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 | +/// | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | +/// | imm16..................................................| +/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+ +/// +pub struct Udf { + /// The immediate value encoded in the instruction + imm16: u16 +} + +impl Udf { + /// UDF - Permanently Undefined + /// + pub fn udf(imm16: u16) -> Self { + Self { imm16 } + } +} + +impl From for u32 { + /// Convert an instruction into a 32-bit value. + fn from(inst: Udf) -> Self { + inst.imm16 as u32 + } +} + +impl From for [u8; 4] { + /// Convert an instruction into a 4 byte array. + fn from(inst: Udf) -> [u8; 4] { + let result: u32 = inst.into(); + result.to_le_bytes() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_udf() { + let result: u32 = Udf::udf(0).into(); + assert_eq!(0x00000000, result); + } + + #[test] + fn test_udf_imm() { + let result: u32 = Udf::udf(1).into(); + assert_eq!(0x00000001, result); + } +} diff --git a/zjit/src/asm/arm64/mod.rs b/zjit/src/asm/arm64/mod.rs index a360d7738b2dbf..b53f1cf6733664 100644 --- a/zjit/src/asm/arm64/mod.rs +++ b/zjit/src/asm/arm64/mod.rs @@ -321,6 +321,12 @@ pub fn brk(cb: &mut CodeBlock, imm16: A64Opnd) { cb.write_bytes(&bytes); } +/// UDF - permanently undefined instruction +pub fn udf(cb: &mut CodeBlock, imm16: u16) { + let bytes: [u8; 4] = Udf::udf(imm16).into(); + cb.write_bytes(&bytes); +} + /// CMP - compare rn and rm, update flags pub fn cmp(cb: &mut CodeBlock, rn: A64Opnd, rm: A64Opnd) { let bytes: [u8; 4] = match (rn, rm) { diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs index 54c803168dc3a7..4d7aa2c9533186 100644 --- a/zjit/src/backend/arm64/mod.rs +++ 
b/zjit/src/backend/arm64/mod.rs @@ -1561,6 +1561,9 @@ impl Assembler { Insn::Breakpoint => { brk(cb, A64Opnd::None); }, + Insn::Abort => { + udf(cb, u16::MAX); + }, Insn::CSelZ { truthy, falsy, out } | Insn::CSelE { truthy, falsy, out } => { csel(cb, out.into(), truthy.into(), falsy.into(), Condition::EQ); diff --git a/zjit/src/backend/lir.rs b/zjit/src/backend/lir.rs index bb8d1e1e735b03..7335680f84fac1 100644 --- a/zjit/src/backend/lir.rs +++ b/zjit/src/backend/lir.rs @@ -653,6 +653,9 @@ pub enum Insn { #[allow(dead_code)] Breakpoint, + // Abort the process + Abort, + /// Add a comment into the IR at the point that this instruction is added. /// It won't have any impact on that actual compiled code. Comment(String), @@ -895,6 +898,7 @@ impl Insn { Insn::And { .. } => "And", Insn::BakeString(_) => "BakeString", Insn::Breakpoint => "Breakpoint", + Insn::Abort => "Abort", Insn::Comment(_) => "Comment", Insn::Cmp { .. } => "Cmp", Insn::CPop { .. } => "CPop", @@ -1185,7 +1189,7 @@ impl<'a> Iterator for InsnOpndIterator<'a> { } Insn::BakeString(_) | - Insn::Breakpoint | + Insn::Breakpoint | Insn::Abort | Insn::Comment(_) | Insn::CPop { .. } | Insn::PadPatchPoint | @@ -1363,7 +1367,7 @@ impl<'a> InsnOpndMutIterator<'a> { } Insn::BakeString(_) | - Insn::Breakpoint | + Insn::Breakpoint | Insn::Abort | Insn::Comment(_) | Insn::CPop { .. } | Insn::FrameSetup { .. 
} | @@ -3465,6 +3469,11 @@ impl Assembler { self.push_insn(Insn::Breakpoint); } + #[allow(dead_code)] + pub fn abort(&mut self) { + self.push_insn(Insn::Abort); + } + /// Call a C function without PosMarkers pub fn ccall(&mut self, fptr: *const u8, opnds: Vec) -> Opnd { let canary_opnd = self.set_stack_canary(); diff --git a/zjit/src/backend/x86_64/mod.rs b/zjit/src/backend/x86_64/mod.rs index 3904bfd71f3dcb..a3af9856dab291 100644 --- a/zjit/src/backend/x86_64/mod.rs +++ b/zjit/src/backend/x86_64/mod.rs @@ -1089,6 +1089,7 @@ impl Assembler { }, Insn::Breakpoint => int3(cb), + Insn::Abort => ud2(cb), Insn::CSelZ { truthy, falsy, out } => { emit_csel(cb, *truthy, *falsy, *out, cmovz, cmovnz); diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index b9b8b6509abfb2..097257ddf85ede 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -754,6 +754,7 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio Insn::ObjToString { val, cd, state, .. } => gen_objtostring(jit, asm, opnd!(val), *cd, &function.frame_state(*state)), &Insn::CheckInterrupts { state } => no_output!(gen_check_interrupts(jit, asm, &function.frame_state(state))), Insn::BreakPoint => no_output!(asm.breakpoint()), + Insn::Unreachable => no_output!(asm.abort()), &Insn::HashDup { val, state } => { gen_hash_dup(asm, opnd!(val), &function.frame_state(state)) }, &Insn::HashAref { hash, key, state } => { gen_hash_aref(jit, asm, opnd!(hash), opnd!(key), &function.frame_state(state)) }, &Insn::HashAset { hash, key, val, state } => { no_output!(gen_hash_aset(jit, asm, opnd!(hash), opnd!(key), opnd!(val), &function.frame_state(state))) }, diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index 27bf5df42741ee..1d8358cbada039 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -1147,6 +1147,11 @@ pub enum Insn { CheckInterrupts { state: InsnId }, BreakPoint, + + /// Only use this instruction in tests where you need to end a block with + /// a terminator, but don't ever expect the 
code to be executed. This + /// instruction should never be generated from iseq_to_hir + Unreachable, } /// Macro that enumerates all operands of an Insn, dispatching to caller-provided @@ -1165,7 +1170,7 @@ macro_rules! for_each_operand_impl { | Insn::LoadEC | Insn::GetEP { .. } | Insn::LoadSelf - | Insn::BreakPoint + | Insn::BreakPoint | Insn::Unreachable | Insn::PutSpecialObject { .. } | Insn::IncrCounter(_) | Insn::IncrCounterPtr { .. } => {} @@ -1471,7 +1476,7 @@ impl Insn { | Insn::PatchPoint { .. } | Insn::SetIvar { .. } | Insn::SetClassVar { .. } | Insn::ArrayExtend { .. } | Insn::ArrayPush { .. } | Insn::SideExit { .. } | Insn::SetGlobal { .. } | Insn::SetLocal { .. } | Insn::Throw { .. } | Insn::IncrCounter(_) | Insn::IncrCounterPtr { .. } - | Insn::CheckInterrupts { .. } | Insn::BreakPoint + | Insn::CheckInterrupts { .. } | Insn::BreakPoint | Insn::Unreachable | Insn::StoreField { .. } | Insn::WriteBarrier { .. } | Insn::HashAset { .. } | Insn::ArrayAset { .. } => false, _ => true, @@ -1698,7 +1703,7 @@ impl Insn { abstract_heaps::Control ), Insn::Entries { .. } => effects::Any, - Insn::BreakPoint => Effect::read_write(abstract_heaps::Empty, abstract_heaps::Control), + Insn::BreakPoint | Insn::Unreachable => Effect::read_write(abstract_heaps::Empty, abstract_heaps::Control), } } @@ -2223,6 +2228,7 @@ impl<'a> std::fmt::Display for InsnPrinter<'a> { Insn::CheckInterrupts { .. } => write!(f, "CheckInterrupts"), Insn::IsA { val, class } => write!(f, "IsA {val}, {class}"), Insn::BreakPoint => write!(f, "BreakPoint"), + Insn::Unreachable => write!(f, "Unreachable"), } } } @@ -2837,7 +2843,7 @@ impl Function { | Insn::PatchPoint { .. } | Insn::SetIvar { .. } | Insn::SetClassVar { .. } | Insn::ArrayExtend { .. } | Insn::ArrayPush { .. } | Insn::SideExit { .. } | Insn::SetLocal { .. } | Insn::IncrCounter(_) | Insn::IncrCounterPtr { .. } - | Insn::CheckInterrupts { .. } | Insn::BreakPoint + | Insn::CheckInterrupts { .. 
} | Insn::BreakPoint | Insn::Unreachable | Insn::StoreField { .. } | Insn::WriteBarrier { .. } | Insn::HashAset { .. } | Insn::ArrayAset { .. } => panic!("Cannot infer type of instruction with no output: {}. See Insn::has_output().", self.insns[insn.0]), Insn::Const { val: Const::Value(val) } => Type::from_value(*val), @@ -5810,7 +5816,7 @@ impl Function { | Insn::LoadSP | Insn::LoadEC | Insn::GetEP { .. } - | Insn::BreakPoint + | Insn::BreakPoint | Insn::Unreachable | Insn::LoadSelf | Insn::Snapshot { .. } | Insn::Jump { .. } From 5faeea873ba4249c685c7f390e939560d736fea8 Mon Sep 17 00:00:00 2001 From: Peter Zhu Date: Fri, 8 May 2026 18:02:12 -0400 Subject: [PATCH 12/12] Set EC in rb_gc_event_hook The event hook may use the EC and it will be null when it is running from a GC thread. --- gc.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/gc.c b/gc.c index 0976d6b7b0b0cb..1d3f6fa6ed15be 100644 --- a/gc.c +++ b/gc.c @@ -242,7 +242,30 @@ rb_gc_event_hook(VALUE obj, rb_event_flag_t event) rb_execution_context_t *ec = rb_gc_get_ec(); if (!ec->cfp) return; +#if USE_MODULAR_GC + bool gc_thread_p = false; + if (!GET_EC()) { + gc_thread_p = true; + +# ifdef RB_THREAD_LOCAL_SPECIFIER + rb_current_ec_set(ec); +# else + native_tls_set(ruby_current_ec_key, ec); +# endif + } +#endif + EXEC_EVENT_HOOK(ec, event, ec->cfp->self, 0, 0, 0, obj); + +#if USE_MODULAR_GC + if (gc_thread_p) { +# ifdef RB_THREAD_LOCAL_SPECIFIER + rb_current_ec_set(NULL); +# else + native_tls_set(ruby_current_ec_key, NULL); +# endif + } +#endif } void *