From 772b97e48d30ad97b1f05d3681461f3bafbbd0b0 Mon Sep 17 00:00:00 2001
From: Sigilante <neal.davis@protonmail.com>
Date: Sun, 31 May 2026 07:28:47 -0500
Subject: [PATCH 1/4] noun: migrate lagoon jet to stateless SoftBLAS

Bump ext/softblas to urbit/SoftBLAS 40ff4fa (allocation-free; dropped the
global softblas_roundingMode for a per-call rndMode arg; B1-B5 correctness
fixes) and drop the now-deleted src/softblas_state.c from the wrapper.

Update the lagoon jet for the new API: thread the active rounding mode
through all 32 BLAS calls via a file-static _la_rnd (replacing the removed
global), and rename the local _set_rounding -> _set_rounding_la to avoid
colliding with softblas.h's new inline _set_rounding.

The jet source matches numerics canonical (lagoon/vere). Built with
zig 0.15.2; add/dot/mmul/mod verified correct on a fake ~hex.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 ext/softblas/build.zig     |   1 -
 ext/softblas/build.zig.zon |   4 +-
 pkg/noun/jets/i/lagoon.c   | 434 ++++++++++++++++++++-----------------
 3 files changed, 240 insertions(+), 199 deletions(-)

diff --git a/ext/softblas/build.zig b/ext/softblas/build.zig
index 560e4977fc..2bf3e5e7eb 100644
--- a/ext/softblas/build.zig
+++ b/ext/softblas/build.zig
@@ -24,7 +24,6 @@ pub fn build(b: *std.Build) void {
     lib.addCSourceFiles(.{
         .root = dep_c.path(""),
         .files = &.{
-            "src/softblas_state.c",
             "src/blas/level1/sasum.c",
             "src/blas/level1/dasum.c",
             "src/blas/level1/hasum.c",
diff --git a/ext/softblas/build.zig.zon b/ext/softblas/build.zig.zon
index 8f657dfb98..df0a8c68aa 100644
--- a/ext/softblas/build.zig.zon
+++ b/ext/softblas/build.zig.zon
@@ -7,8 +7,8 @@
             .path = "../softfloat",
         },
         .softblas = .{
-            .url = "https://github.com/urbit/SoftBLAS/archive/cbffb33f19ea02f9ffbd184d445123c57929ec53.tar.gz",
-            .hash = "N-V-__8AAJD5MgBhfBHYae8jFlcaQw9R-TRw4tcUFB3rO9-q",
+            .url = "https://github.com/urbit/SoftBLAS/archive/40ff4fa0a3fba0a566ac5fcb8ab93d280fddf7af.tar.gz",
+            .hash = "N-V-__8AAObNNABARc0F3nVusDf1nQ4yxZH78h5A6HkWWAA9",
         },
     },
     .paths = .{
diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c
index db7088b96e..23a818188d 100644
--- a/pkg/noun/jets/i/lagoon.c
+++ b/pkg/noun/jets/i/lagoon.c
@@ -37,9 +37,16 @@
     c3_d c[2];
   };
 
-  //  $?(%n %u %d %z %a)
+  //  SoftBLAS is now stateless: rounding is passed per call. This holds
+  //  the active mode (replaces the old softblas_roundingMode global).
+  static c3_y _la_rnd = 'n';
+
+  //  Set the SoftFloat/SoftBLAS rounding mode from a rounding-mode atom.
+  //  Accepts %n %u %d %z (see +$rounding-mode); %a (nearest, ties away)
+  //  is handled too, but the hoon @rs/@rd/@rq/@rh doors never produce it.
+  //  Any other value bails.
   static inline void
-  _set_rounding(c3_w a)
+  _set_rounding_la(c3_w a)
   {
     // We could use SoftBLAS set_rounding() to set the SoftFloat
     // mode as well, but it's more explicit to do it here since
@@ -53,27 +60,27 @@
     // %n - near
     case c3__n:
       softfloat_roundingMode = softfloat_round_near_even;
-      softblas_roundingMode = 'n';
+      _la_rnd = 'n';
       break;
     // %z - zero
     case c3__z:
       softfloat_roundingMode = softfloat_round_minMag;
-      softblas_roundingMode = 'z';
+      _la_rnd = 'z';
       break;
     // %u - up
     case c3__u:
       softfloat_roundingMode = softfloat_round_max;
-      softblas_roundingMode = 'u';
+      _la_rnd = 'u';
       break;
     // %d - down
     case c3__d:
       softfloat_roundingMode = softfloat_round_min;
-      softblas_roundingMode = 'd';
+      _la_rnd = 'd';
       break;
     // %a - away
     case c3__a:
       softfloat_roundingMode = softfloat_round_near_maxMag;
-      softblas_roundingMode = 'a';
+      _la_rnd = 'a';
       break;
     }
   }
@@ -152,24 +159,25 @@
 
     // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy)
     c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y));
-    u3r_bytes(0, syz_x+1, y_bytes, y_data);
+    u3r_bytes(0, syz_x, y_bytes, y_data);
+    y_bytes[syz_x] = 0x1;
     
     //  Switch on the block size.
     switch (u3x_atom(bloq)) {
       case 4:
-        haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1);
+        haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1, _la_rnd);
         break;
 
       case 5:
-        saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1);
+        saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1, _la_rnd);
         break;
 
       case 6:
-        daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1);
+        daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1, _la_rnd);
         break;
 
       case 7:
-        qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1);
+        qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1, _la_rnd);
         break;
     }
 
@@ -204,35 +212,36 @@
     // syz_x is length in bytes
     c3_d syz_x = len_x * pow(2, bloq-3);
 
-    // x_bytes is the data array (w/o leading 0x1)
-    c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y));
+    // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy; holds result)
+    c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y));
     u3r_bytes(0, syz_x, x_bytes, x_data);
+    x_bytes[syz_x] = 0x1;
 
-    // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy)
-    c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y));
-    u3r_bytes(0, syz_x+1, y_bytes, y_data);
-    
-    //  Switch on the block size.
+    // y_bytes is the data array (w/o leading 0x1)
+    c3_y* y_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y));
+    u3r_bytes(0, syz_x, y_bytes, y_data);
+
+    //  Switch on the block size.  Computes x_bytes := -1*y + x = x - y.
     switch (u3x_atom(bloq)) {
       case 4:
-        haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1);
+        haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 5:
-        saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1);
+        saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 6:
-        daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1);
+        daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 7:
-        qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1);
+        qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1, _la_rnd);
         break;
     }
 
     // r_data is the result noun of [data]
-    u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes);
+    u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes);
 
     //  Clean up and return.
     u3a_free(x_bytes);
@@ -269,7 +278,8 @@
 
     // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy)
     c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y));
-    u3r_bytes(0, syz_x+1, y_bytes, y_data);
+    u3r_bytes(0, syz_x, y_bytes, y_data);
+    y_bytes[syz_x] = 0x1;
 
     //  Switch on the block size.
     switch (u3x_atom(bloq)) {
@@ -335,7 +345,8 @@
 
     // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy)
     c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y));
-    u3r_bytes(0, syz_x+1, y_bytes, y_data);
+    u3r_bytes(0, syz_x, y_bytes, y_data);
+    y_bytes[syz_x] = 0x1;
 
     //  Switch on the block size.
     switch (u3x_atom(bloq)) {
@@ -401,7 +412,8 @@
 
     // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy)
     c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y));
-    u3r_bytes(0, syz_x+1, y_bytes, y_data);
+    u3r_bytes(0, syz_x, y_bytes, y_data);
+    y_bytes[syz_x] = 0x1;
 
     //  Switch on the block size.
     switch (u3x_atom(bloq)) {
@@ -411,12 +423,12 @@
           float16_t y_val16 = ((float16_t*)y_bytes)[i];
           // Perform division x/n
           float16_t div_result16 = f16_div(x_val16, y_val16);
-          // Compute floor of the division result
-          c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false);
-          float16_t floor_float16 = i64_to_f16(floor_result16);
-          // Multiply n by floor(x/n)
-          float16_t mult_result16 = f16_mul(y_val16, floor_float16);
-          // Compute remainder: x - n * floor(x/n)
+          // Round the quotient using the active rounding mode (matches Hoon toi)
+          c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_roundingMode, false);
+          float16_t quot_round_f16 = i64_to_f16(quot_round16);
+          // Multiply n by round(x/n)
+          float16_t mult_result16 = f16_mul(y_val16, quot_round_f16);
+          // Compute remainder: x - n * round(x/n)
           ((float16_t*)y_bytes)[i] = f16_sub(x_val16, mult_result16);
         }
         break;
@@ -427,12 +439,12 @@
           float32_t y_val32 = ((float32_t*)y_bytes)[i];
           // Perform division x/n
           float32_t div_result32 = f32_div(x_val32, y_val32);
-          // Compute floor of the division result
-          c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false);
-          float32_t floor_float32 = i64_to_f32(floor_result32);
-          // Multiply n by floor(x/n)
-          float32_t mult_result32 = f32_mul(y_val32, floor_float32);
-          // Compute remainder: x - n * floor(x/n)
+          // Round the quotient using the active rounding mode (matches Hoon toi)
+          c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_roundingMode, false);
+          float32_t quot_round_f32 = i64_to_f32(quot_round32);
+          // Multiply n by round(x/n)
+          float32_t mult_result32 = f32_mul(y_val32, quot_round_f32);
+          // Compute remainder: x - n * round(x/n)
           ((float32_t*)y_bytes)[i] = f32_sub(x_val32, mult_result32);
         }
         break;
@@ -443,12 +455,12 @@
           float64_t y_val64 = ((float64_t*)y_bytes)[i];
           // Perform division x/n
           float64_t div_result64 = f64_div(x_val64, y_val64);
-          // Compute floor of the division result
-          c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false);
-          float64_t floor_float64 = i64_to_f64(floor_result64);
-          // Multiply n by floor(x/n)
-          float64_t mult_result64 = f64_mul(y_val64, floor_float64);
-          // Compute remainder: x - n * floor(x/n)
+          // Round the quotient using the active rounding mode (matches Hoon toi)
+          c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_roundingMode, false);
+          float64_t quot_round_f64 = i64_to_f64(quot_round64);
+          // Multiply n by round(x/n)
+          float64_t mult_result64 = f64_mul(y_val64, quot_round_f64);
+          // Compute remainder: x - n * round(x/n)
           ((float64_t*)y_bytes)[i] = f64_sub(x_val64, mult_result64);
         }
         break;
@@ -460,14 +472,14 @@
           // Perform division x/n
           float128_t div_result128;
           f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128);
-          // Compute floor of the division result
-          c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false);
-          float128_t floor_float128;
-          i64_to_f128M(floor_result128, &floor_float128);
-          // Multiply n by floor(x/n)
+          // Round the quotient using the active rounding mode (matches Hoon toi)
+          c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_roundingMode, false);
+          float128_t quot_round_f128;
+          i64_to_f128M(quot_round128, &quot_round_f128);
+          // Multiply n by round(x/n)
           float128_t mult_result128;
-          f128M_mul(((float128_t*)&y_val128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128));
-          // Compute remainder: x - n * floor(x/n)
+          f128M_mul(((float128_t*)&y_val128), ((float128_t*)&quot_round_f128), ((float128_t*)&mult_result128));
+          // Compute remainder: x - n * round(x/n)
           f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)y_bytes)[i]));
         }
         break;
@@ -580,7 +592,7 @@
     c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y));
     u3r_bytes(0, syz_x, x_bytes, x_data);
 
-    c3_w min_idx = 0;
+    c3_d min_idx = 0;
 
     //  Switch on the block size.
     switch (u3x_atom(bloq)) {
@@ -589,7 +601,7 @@
         for (c3_d i = 0; i < len_x; i++) {
            if(f16_lt(((float16_t*)x_bytes)[i], min_val16)) {
              min_val16 = ((float16_t*)x_bytes)[i];
-             min_idx = (len_x - i - 1);
+             min_idx = i;
            }
         }
         break;}
@@ -599,7 +611,7 @@
         for (c3_d i = 0; i < len_x; i++) {
            if(f32_lt(((float32_t*)x_bytes)[i], min_val32)) {
              min_val32 = ((float32_t*)x_bytes)[i];
-             min_idx = (len_x - i - 1);
+             min_idx = i;
            }
         }
         break;}
@@ -609,7 +621,7 @@
         for (c3_d i = 0; i < len_x; i++) {
            if(f64_lt(((float64_t*)x_bytes)[i], min_val64)) {
              min_val64 = ((float64_t*)x_bytes)[i];
-             min_idx = (len_x - i - 1);
+             min_idx = i;
            }
         }
         break;}
@@ -619,7 +631,7 @@
         for (c3_d i = 0; i < len_x; i++) {
            if(f128M_lt(&(((float128_t*)x_bytes)[i]), &min_val128)) {
              min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]);
-             min_idx = (len_x - i - 1);
+             min_idx = i;
            }
         }
         break;}
@@ -653,7 +665,7 @@
     c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y));
     u3r_bytes(0, syz_x, x_bytes, x_data);
 
-    c3_w max_idx = 0;
+    c3_d max_idx = 0;
 
     //  Switch on the block size.
     switch (u3x_atom(bloq)) {
@@ -662,7 +674,7 @@
         for (c3_d i = 0; i < len_x; i++) {
            if(f16_gt(((float16_t*)x_bytes)[i], max_val16)) {
              max_val16 = ((float16_t*)x_bytes)[i];
-             max_idx = (len_x - i - 1);
+             max_idx = i;
            }
         }
         break;}
@@ -672,7 +684,7 @@
         for (c3_d i = 0; i < len_x; i++) {
            if(f32_gt(((float32_t*)x_bytes)[i], max_val32)) {
              max_val32 = ((float32_t*)x_bytes)[i];
-             max_idx = (len_x - i - 1);
+             max_idx = i;
            }
         }
         break;}
@@ -682,7 +694,7 @@
         for (c3_d i = 0; i < len_x; i++) {
            if(f64_gt(((float64_t*)x_bytes)[i], max_val64)) {
              max_val64 = ((float64_t*)x_bytes)[i];
-             max_idx = (len_x - i - 1);
+             max_idx = i;
            }
         }
         break;}
@@ -692,7 +704,7 @@
         for (c3_d i = 0; i < len_x; i++) {
            if(f128M_gt(&(((float128_t*)x_bytes)[i]), &max_val128)) {
              max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]);
-             max_idx = (len_x - i - 1);
+             max_idx = i;
            }
         }
         break;}
@@ -1055,7 +1067,7 @@
     return r_data;
   }
 
-/* gte - x > y
+/* gte - x >= y
 */
   u3_noun
   u3qi_la_gte_i754(u3_noun x_data,
@@ -1128,7 +1140,7 @@
     return r_data;
   }
 
-/* lth - x > y
+/* lth - x < y
 */
   u3_noun
   u3qi_la_lth_i754(u3_noun x_data,
@@ -1201,7 +1213,7 @@
     return r_data;
   }
 
-/* lte - x > y
+/* lte - x <= y
 */
   u3_noun
   u3qi_la_lte_i754(u3_noun x_data,
@@ -1314,7 +1326,7 @@
         for (c3_d i = 0; i < len_x; i++) {
           ((float16_t*)y_bytes)[i] = n16;
         }
-        haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1);
+        haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1, _la_rnd);
         break;
 
       case 5:
@@ -1323,7 +1335,7 @@
         for (c3_d i = 0; i < len_x; i++) {
           ((float32_t*)y_bytes)[i] = n32;
         }
-        saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1);
+        saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1, _la_rnd);
         break;
 
       case 6:
@@ -1332,7 +1344,7 @@
         for (c3_d i = 0; i < len_x; i++) {
           ((float64_t*)y_bytes)[i] = n64;
         }
-        daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1);
+        daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1, _la_rnd);
         break;
 
       case 7:
@@ -1341,7 +1353,7 @@
         for (c3_d i = 0; i < len_x; i++) {
           ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]};
         }
-        qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1);
+        qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1, _la_rnd);
         break;
     }
 
@@ -1396,7 +1408,7 @@
         for (c3_d i = 0; i < len_x; i++) {
           ((float16_t*)y_bytes)[i] = n16;
         }
-        haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1);
+        haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 5:
@@ -1405,7 +1417,7 @@
         for (c3_d i = 0; i < len_x; i++) {
           ((float32_t*)y_bytes)[i] = n32;
         }
-        saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1);
+        saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 6:
@@ -1414,7 +1426,7 @@
         for (c3_d i = 0; i < len_x; i++) {
           ((float64_t*)y_bytes)[i] = n64;
         }
-        daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1);
+        daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 7:
@@ -1423,7 +1435,7 @@
         for (c3_d i = 0; i < len_x; i++) {
           ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]};
         }
-        qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1);
+        qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1, _la_rnd);
         break;
     }
 
@@ -1473,22 +1485,22 @@
     switch (u3x_atom(bloq)) {
       case 4:
         u3r_bytes(0, 2, (c3_y*)&(n16.v), n);
-        hscal(len_x, n16, (float16_t*)x_bytes, 1);
+        hscal(len_x, n16, (float16_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 5:
         u3r_bytes(0, 4, (c3_y*)&(n32.v), n);
-        sscal(len_x, n32, (float32_t*)x_bytes, 1);
+        sscal(len_x, n32, (float32_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 6:
         u3r_bytes(0, 8, (c3_y*)&(n64.v), n);
-        dscal(len_x, n64, (float64_t*)x_bytes, 1);
+        dscal(len_x, n64, (float64_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 7:
         u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n);
-        qscal(len_x, n128, (float128_t*)x_bytes, 1);
+        qscal(len_x, n128, (float128_t*)x_bytes, 1, _la_rnd);
         break;
     }
 
@@ -1538,28 +1550,28 @@
         //  XX note that in16 is doing double duty here
         u3r_bytes(0, 2, (c3_y*)&(in16.v), n);
         in16 = f16_div((float16_t){SB_REAL16_ONE}, in16);
-        hscal(len_x, in16, (float16_t*)x_bytes, 1);
+        hscal(len_x, in16, (float16_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 5:
         //  XX note that in32 is doing double duty here
         u3r_bytes(0, 4, (c3_y*)&(in32.v), n);
         in32 = f32_div((float32_t){SB_REAL32_ONE}, in32);
-        sscal(len_x, in32, (float32_t*)x_bytes, 1);
+        sscal(len_x, in32, (float32_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 6:
         //  XX note that in64 is doing double duty here
         u3r_bytes(0, 8, (c3_y*)&(in64.v), n);
         in64 = f64_div((float64_t){SB_REAL64_ONE}, in64);
-        dscal(len_x, in64, (float64_t*)x_bytes, 1);
+        dscal(len_x, in64, (float64_t*)x_bytes, 1, _la_rnd);
         break;
 
       case 7:
         //  XX note that in128 is doing double duty here
         u3r_bytes(0, 16, (c3_y*)&(in128.v[0]), n);
         f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &in128, &in128);
-        qscal(len_x, in128, (float128_t*)x_bytes, 1);
+        qscal(len_x, in128, (float128_t*)x_bytes, 1, _la_rnd);
         break;
     }
 
@@ -1613,12 +1625,12 @@
           float16_t x_val16 = ((float16_t*)x_bytes)[i];
           // Perform division x/n
           float16_t div_result16 = f16_mul(in16, x_val16);
-          // Compute floor of the division result
-          c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false);
-          float16_t floor_float16 = i64_to_f16(floor_result16);
-          // Multiply n by floor(x/n)
-          float16_t mult_result16 = f16_mul(n16, floor_float16);
-          // Compute remainder: x - n * floor(x/n)
+          // Round the quotient using the active rounding mode (matches Hoon toi)
+          c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_roundingMode, false);
+          float16_t quot_round_f16 = i64_to_f16(quot_round16);
+          // Multiply n by round(x/n)
+          float16_t mult_result16 = f16_mul(n16, quot_round_f16);
+          // Compute remainder: x - n * round(x/n)
           ((float16_t*)x_bytes)[i] = f16_sub(x_val16, mult_result16);
         }
         break;
@@ -1630,13 +1642,13 @@
         for (c3_d i = 0; i < len_x; i++) {
           float32_t x_val32 = ((float32_t*)x_bytes)[i];
           // Perform division x/n
-          float32_t div_result32 = f32_mul(in32, x_val32);
-          // Compute floor of the division result
-          c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false);
-          float32_t floor_float32 = i64_to_f32(floor_result32);
-          // Multiply n by floor(x/n)
-          float32_t mult_result32 = f32_mul(n32, floor_float32);
-          // Compute remainder: x - n * floor(x/n)
+          float32_t div_result32 = f32_mul((float32_t)in32, (float32_t)x_val32);
+          // Round the quotient using the active rounding mode (matches Hoon toi)
+          c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_roundingMode, false);
+          float32_t quot_round_f32 = i64_to_f32(quot_round32);
+          // Multiply n by round(x/n)
+          float32_t mult_result32 = f32_mul(n32, quot_round_f32);
+          // Compute remainder: x - n * round(x/n)
           ((float32_t*)x_bytes)[i] = f32_sub(x_val32, mult_result32);
         }
         break;
@@ -1649,33 +1661,33 @@
           float64_t x_val64 = ((float64_t*)x_bytes)[i];
           // Perform division x/n
           float64_t div_result64 = f64_mul(in64, x_val64);
-          // Compute floor of the division result
-          c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false);
-          float64_t floor_float64 = i64_to_f64(floor_result64);
-          // Multiply n by floor(x/n)
-          float64_t mult_result64 = f64_mul(n64, floor_float64);
-          // Compute remainder: x - n * floor(x/n)
+          // Round the quotient using the active rounding mode (matches Hoon toi)
+          c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_roundingMode, false);
+          float64_t quot_round_f64 = i64_to_f64(quot_round64);
+          // Multiply n by round(x/n)
+          float64_t mult_result64 = f64_mul(n64, quot_round_f64);
+          // Compute remainder: x - n * round(x/n)
           ((float64_t*)x_bytes)[i] = f64_sub(x_val64, mult_result64);
         }
         break;
 
       case 7:
         u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n);
-        f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ZERO}), &n128, &in128);
+        f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &n128, &in128);
 
         for (c3_d i = 0; i < len_x; i++) {
           float128_t x_val128 = ((float128_t*)x_bytes)[i];
           // Perform division x/n
           float128_t div_result128;
           f128M_mul((float128_t*)&in128, (float128_t*)&x_val128, (float128_t*)&div_result128);
-          // Compute floor of the division result
-          c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false);
-          float128_t floor_float128;
-          i64_to_f128M(floor_result128, &floor_float128);
-          // Multiply n by floor(x/n)
+          // Round the quotient using the active rounding mode (matches Hoon toi)
+          c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_roundingMode, false);
+          float128_t quot_round_f128;
+          i64_to_f128M(quot_round128, &quot_round_f128);
+          // Multiply n by round(x/n)
           float128_t mult_result128;
-          f128M_mul(((float128_t*)&n128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128));
-          // Compute remainder: x - n * floor(x/n)
+          f128M_mul(((float128_t*)&n128), ((float128_t*)&quot_round_f128), ((float128_t*)&mult_result128));
+          // Compute remainder: x - n * round(x/n)
           f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)x_bytes)[i]));
         }
         break;
@@ -1724,28 +1736,28 @@
     switch (u3x_atom(bloq)) {
       case 4: {
         float16_t r16[2];
-        r16[0] = hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1);
+        r16[0] = hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1, _la_rnd);
         r16[1].v = 0x1;
         r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16);
         break;}
 
       case 5: {
         float32_t r32[2];
-        r32[0] = sdot(len_x, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1);
+        r32[0] = sdot(len_x, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1, _la_rnd);
         r32[1].v = 0x1;
         r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32);
         break;}
 
       case 6: {
         float64_t r64[2];
-        r64[0] = ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1);
+        r64[0] = ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1, _la_rnd);
         r64[1].v = 0x1;
         r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64);
         break;}
 
       case 7: {
         float128_t r128[2];
-        r128[0] = qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1);
+        r128[0] = qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1, _la_rnd);
         r128[1] = (float128_t){0x1, 0x0};
         r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128);
         break;}
@@ -1771,12 +1783,12 @@
     }
     //  Assert length of dims is 2.
     if (u3qb_lent(shape) != 2) {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     }
     //  Unpack shape into an array of dimensions.
     c3_d *dims = _get_dims(shape);
     if (dims[0] != dims[1]) {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     }
 
     //  Unpack the data as a byte array.  We assume total length < 2**64.
@@ -1818,7 +1830,7 @@
   {
     //  Assert length of dims is 2.
     if (u3qb_lent(shape) != 2) {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     }
     //  Unpack shape into an array of dimensions.
     c3_d *dims = _get_dims(shape);
@@ -1867,6 +1879,11 @@
       return u3_none;
     }
 
+    //  Guard degenerate count: n = 0 underflows n-1 and writes out of bounds.
+    if (n < 1) {
+      return u3_none;
+    }
+
     u3_noun r_data;
 
     switch (u3x_atom(bloq)) {
@@ -1971,7 +1988,9 @@
         u3r_bytes(0, 2, (c3_y*)&(a16.v), a);
         u3r_bytes(0, 2, (c3_y*)&(b16.v), b);
         u3r_bytes(0, 2, (c3_y*)&(interval16.v), d);
-        c3_d n16 = f16_to_i64(f16_ceil(f16_div(f16_sub(b16, a16), interval16)), softfloat_round_minMag, false);
+        c3_ds raw_n16 = f16_to_i64(f16_ceil(f16_div(f16_sub(b16, a16), interval16)), softfloat_round_minMag, false);
+        if ( raw_n16 < 1 || raw_n16 > 0xffffffff ) { return u3_none; }
+        c3_d n16 = raw_n16;
         c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n16+1)*2)*sizeof(c3_y));
         ((float16_t*)x_bytes16)[0] = a16;
         for (c3_d i = 1; i < n16; i++) {
@@ -1987,7 +2006,9 @@
         u3r_bytes(0, 4, (c3_y*)&(a32.v), a);
         u3r_bytes(0, 4, (c3_y*)&(b32.v), b);
         u3r_bytes(0, 4, (c3_y*)&(interval32.v), d);
-        c3_d n32 = f32_to_i64(f32_ceil(f32_div(f32_sub(b32, a32), interval32)), softfloat_round_minMag, false);
+        c3_ds raw_n32 = f32_to_i64(f32_ceil(f32_div(f32_sub(b32, a32), interval32)), softfloat_round_minMag, false);
+        if ( raw_n32 < 1 || raw_n32 > 0xffffffff ) { return u3_none; }
+        c3_d n32 = raw_n32;
         c3_y* x_bytes32 = (c3_y*)u3a_malloc(((n32+1)*4)*sizeof(c3_y));
         ((float32_t*)x_bytes32)[0] = a32;
         for (c3_d i = 1; i < n32; i++) {
@@ -2003,7 +2024,9 @@
         u3r_bytes(0, 8, (c3_y*)&(a64.v), a);
         u3r_bytes(0, 8, (c3_y*)&(b64.v), b);
         u3r_bytes(0, 8, (c3_y*)&(interval64.v), d);
-        c3_d n64 = f64_to_i64(f64_ceil(f64_div(f64_sub(b64, a64), interval64)), softfloat_round_minMag, false);
+        c3_ds raw_n64 = f64_to_i64(f64_ceil(f64_div(f64_sub(b64, a64), interval64)), softfloat_round_minMag, false);
+        if ( raw_n64 < 1 || raw_n64 > 0xffffffff ) { return u3_none; }
+        c3_d n64 = raw_n64;
         c3_y* x_bytes64 = (c3_y*)u3a_malloc(((n64+1)*8)*sizeof(c3_y));
         ((float64_t*)x_bytes64)[0] = a64;
         for (c3_d i = 1; i < n64; i++) {
@@ -2023,7 +2046,9 @@
         f128M_sub(&b128, &a128, &tmp);
         f128M_div(&tmp, &interval128, &tmp);
         f128M_ceil(&tmp, &tmp);
-        c3_d n128 = f128M_to_i64(&tmp, softfloat_round_minMag, false);
+        c3_ds raw_n128 = f128M_to_i64(&tmp, softfloat_round_minMag, false);
+        if ( raw_n128 < 1 || raw_n128 > 0xffffffff ) { return u3_none; }
+        c3_d n128 = raw_n128;
         c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n128+1)*16)*sizeof(c3_y));
         float128_t i128;
         ((float128_t*)x_bytes128)[0] = a128;
@@ -2050,7 +2075,9 @@
                      u3_noun bloq)
   {
     u3_noun d_data = u3qi_la_diag(x_data, shape, bloq);
-    c3_d len_x0 = _get_dims(shape)[0];
+    c3_d *dim_x = _get_dims(shape);
+    c3_d len_x0 = dim_x[0];
+    u3a_free(dim_x);
     u3_noun r_data = u3qi_la_dot_i754(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), u3k(bloq));
     return r_data;
   }
@@ -2070,10 +2097,15 @@
     c3_d Nb= u3x_atom(u3h(y_shape));
     c3_d P = u3x_atom(u3h(u3t(y_shape)));
 
+    //  Fence on valid bloq size.
+    if (bloq < 4 || bloq > 7) {
+      return u3_none;
+    }
+
     if ((u3_nul != u3t(u3t(x_shape))) ||
         (u3_nul != u3t(u3t(y_shape))) ||
         (Na != Nb)) {
-      return u3m_bail(c3__exit);
+      return u3_none;
     }
     c3_d N = Na;
 
@@ -2115,19 +2147,19 @@
     //  Switch on the block size.
     switch (u3x_atom(bloq)) {
       case 4:
-        hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, P, (float16_t){SB_REAL16_ZERO}, (float16_t*)r_bytes, P);
+        hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, P, (float16_t){SB_REAL16_ZERO}, (float16_t*)r_bytes, P, _la_rnd);
         break;
 
       case 5:
-        sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, P, (float32_t){SB_REAL32_ZERO}, (float32_t*)r_bytes, P);
+        sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, P, (float32_t){SB_REAL32_ZERO}, (float32_t*)r_bytes, P, _la_rnd);
         break;
 
       case 6:
-        dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, P, (float64_t){SB_REAL64_ZERO}, (float64_t*)r_bytes, P);
+        dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, P, (float64_t){SB_REAL64_ZERO}, (float64_t*)r_bytes, P, _la_rnd);
         break;
 
       case 7:
-        qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, P, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)r_bytes, P);
+        qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, P, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)r_bytes, P, _la_rnd);
         break;
     }
 
@@ -2160,7 +2192,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2174,11 +2206,11 @@
            c3n == u3ud(rnd)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_add_i754(x_data, y_data, x_shape, x_bloq);
             if (r_data == u3_none) { return u3_none; }
             return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2207,7 +2239,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2221,11 +2253,11 @@
            c3n == u3ud(rnd)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_sub_i754(x_data, y_data, x_shape, x_bloq);
             if (r_data == u3_none) { return u3_none; }
             return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2254,7 +2286,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2268,11 +2300,11 @@
            c3n == u3ud(rnd)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_mul_i754(x_data, y_data, x_shape, x_bloq);
             if (r_data == u3_none) { return u3_none; }
             return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2301,7 +2333,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2315,11 +2347,11 @@
            c3n == u3ud(rnd)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_div_i754(x_data, y_data, x_shape, x_bloq);
             if (r_data == u3_none) { return u3_none; }
             return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2348,7 +2380,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2362,11 +2394,11 @@
            c3n == u3ud(rnd)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_mod_i754(x_data, y_data, x_shape, x_bloq);
             if (r_data == u3_none) { return u3_none; }
             return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2390,7 +2422,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2404,11 +2436,11 @@
            c3n == _check(u3nc(x_meta, x_data))
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_cumsum_i754(x_data, x_shape, x_bloq);
             if (r_data == u3_none) { return u3_none; }
             return u3nc(u3nq(u3nc(0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2432,7 +2464,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind;
       x_shape = u3h(x_meta);          //  2
@@ -2443,7 +2475,7 @@
            c3n == _check(u3nc(x_meta, x_data))
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2470,7 +2502,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind;
       x_shape = u3h(x_meta);          //  2
@@ -2481,7 +2513,7 @@
            c3n == _check(u3nc(x_meta, x_data))
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2508,7 +2540,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind;
       x_shape = u3h(x_meta);          //  2
@@ -2519,7 +2551,7 @@
            c3n == _check(u3nc(x_meta, x_data))
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2546,7 +2578,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail;
       x_shape = u3h(x_meta);          //  2
@@ -2558,7 +2590,7 @@
            c3n == _check(u3nc(x_meta, x_data))
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2585,7 +2617,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail;
       x_shape = u3h(x_meta);          //  2
@@ -2597,7 +2629,7 @@
            c3n == _check(u3nc(x_meta, x_data))
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2624,7 +2656,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail;
       x_shape = u3h(x_meta);          //  2
@@ -2635,7 +2667,7 @@
            c3n == u3ud(x_kind)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2667,7 +2699,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind;
       x_shape = u3h(x_meta);          //  2
@@ -2677,7 +2709,7 @@
            c3n == u3ud(x_kind)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2709,7 +2741,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind;
       x_shape = u3h(x_meta);          //  2
@@ -2719,7 +2751,7 @@
            c3n == u3ud(x_kind)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2751,7 +2783,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind;
       x_shape = u3h(x_meta);          //  2
@@ -2761,7 +2793,7 @@
            c3n == u3ud(x_kind)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2793,7 +2825,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind;
       x_shape = u3h(x_meta);          //  2
@@ -2803,7 +2835,7 @@
            c3n == u3ud(x_kind)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -2832,7 +2864,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(n) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2843,7 +2875,7 @@
       rnd = u3h(u3t(u3t(u3t(cor))));  // 30
       switch (x_kind) {
         case c3__i754:
-          _set_rounding(rnd);
+          _set_rounding_la(rnd);
           u3_noun r_data = u3qi_la_adds_i754(x_data, n, x_shape, x_bloq);
           if (r_data == u3_none) { return u3_none; }
           return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2868,7 +2900,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(n) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2879,7 +2911,7 @@
       rnd = u3h(u3t(u3t(u3t(cor))));  // 30
       switch (x_kind) {
         case c3__i754:
-          _set_rounding(rnd);
+          _set_rounding_la(rnd);
           u3_noun r_data = u3qi_la_subs_i754(x_data, n, x_shape, x_bloq);
           if (r_data == u3_none) { return u3_none; }
           return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2904,7 +2936,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(n) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2915,7 +2947,7 @@
       rnd = u3h(u3t(u3t(u3t(cor))));  // 30
       switch (x_kind) {
         case c3__i754:
-          _set_rounding(rnd);
+          _set_rounding_la(rnd);
           u3_noun r_data = u3qi_la_muls_i754(x_data, n, x_shape, x_bloq);
           if (r_data == u3_none) { return u3_none; }
           return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2940,7 +2972,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(n) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2951,7 +2983,7 @@
       rnd = u3h(u3t(u3t(u3t(cor))));  // 30
       switch (x_kind) {
         case c3__i754:
-          _set_rounding(rnd);
+          _set_rounding_la(rnd);
           u3_noun r_data = u3qi_la_divs_i754(x_data, n, x_shape, x_bloq);
           if (r_data == u3_none) { return u3_none; }
           return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -2976,7 +3008,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(n) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -2987,7 +3019,7 @@
       rnd = u3h(u3t(u3t(u3t(cor))));  // 30
       switch (x_kind) {
         case c3__i754:
-          _set_rounding(rnd);
+          _set_rounding_la(rnd);
           u3_noun r_data = u3qi_la_mods_i754(x_data, n, x_shape, x_bloq);
           if (r_data == u3_none) { return u3_none; }
           return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
@@ -3015,7 +3047,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail,
               rnd;
@@ -3028,14 +3060,16 @@
            c3n == u3ud(x_kind)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_dot_i754(x_data, y_data, x_shape, x_bloq);
             if (r_data == u3_none) { return u3_none; }
-            c3_d len_x0 = _get_dims(x_shape)[0];
+            c3_d *dim_x = _get_dims(x_shape);
+            c3_d len_x0 = dim_x[0];
+            u3a_free(dim_x);
             return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
 
           default:
@@ -3057,7 +3091,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail;
       x_shape = u3h(x_meta);          //  2
@@ -3069,7 +3103,7 @@
            c3n == _check(cor)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         u3_noun r_data = u3qi_la_transpose(x_data, x_shape, x_bloq);
         if (r_data == u3_none) { return u3_none; }
@@ -3090,7 +3124,7 @@
                          u3x_sam_7, &n,
                          0))
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail;
       x_shape = u3h(x_meta);          //  2
@@ -3104,11 +3138,11 @@
            (n < 1)                    // crash on zero size
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_linspace_i754(a, b, n, x_bloq);
             if (r_data == u3_none) { return u3_none; }
             x_shape = u3nc(u3x_atom(n), u3_nul);
@@ -3133,7 +3167,7 @@
                          u3x_sam_7, &d,
                          0))
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail;
       x_shape = u3h(x_meta);          //  2
@@ -3145,34 +3179,40 @@
            c3n == u3ud(x_kind)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_range_i754(a, b, d, x_bloq);
             if (r_data == u3_none) { return u3_none; }
             c3_d a_, b_, d_;
             c3_ds n_;
             switch (x_bloq) {
-              case 4:
+              case 4: {
+                c3_d a_, b_, d_;
                 u3r_bytes(0, 2, (c3_y*)&a_, a);
                 u3r_bytes(0, 2, (c3_y*)&b_, b);
                 u3r_bytes(0, 2, (c3_y*)&d_, d);
                 n_ = f16_to_i64(f16_ceil(f16_div(f16_sub((float16_t){b_}, (float16_t){a_}), (float16_t){d_})), softfloat_round_minMag, false) - 1;
                 break;
-              case 5:
+              }
+              case 5: {
+                c3_d a_, b_, d_;
                 u3r_bytes(0, 4, (c3_y*)&a_, a);
                 u3r_bytes(0, 4, (c3_y*)&b_, b);
                 u3r_bytes(0, 4, (c3_y*)&d_, d);
                 n_ = f32_to_i64(f32_ceil(f32_div(f32_sub((float32_t){b_}, (float32_t){a_}), (float32_t){d_})), softfloat_round_minMag, false) - 1;
                 break;
-              case 6:
+              }
+              case 6: {
+                c3_d a_, b_, d_;
                 u3r_bytes(0, 8, (c3_y*)&a_, a);
                 u3r_bytes(0, 8, (c3_y*)&b_, b);
                 u3r_bytes(0, 8, (c3_y*)&d_, d);
                 n_ = f64_to_i64(f64_ceil(f64_div(f64_sub((float64_t){b_}, (float64_t){a_}), (float64_t){d_})), softfloat_round_minMag, false) - 1;
                 break;
+              }
               case 7: {
                 c3_d a__[2], b__[2], d__[2];
                 u3r_bytes(0, 16, (c3_y*)&a__, a);
@@ -3208,7 +3248,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail;
       x_shape = u3h(x_meta);          //  2
@@ -3220,11 +3260,13 @@
            c3n == _check(cor)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         u3_noun r_data = u3qi_la_diag(x_data, x_shape, x_bloq);
         if (r_data == u3_none) { return u3_none; }
-        c3_d len_x0 = _get_dims(x_shape)[0];
+        c3_d *dim_x = _get_dims(x_shape);
+        c3_d len_x0 = dim_x[0];
+        u3a_free(dim_x);
         return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);
       }
     }
@@ -3242,7 +3284,7 @@
                          0) ||
          c3n == u3ud(x_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind, x_tail;
       if ( c3n == u3r_mean(x_meta,
@@ -3253,7 +3295,7 @@
                             0)
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754: {
@@ -3284,7 +3326,7 @@
          c3n == u3ud(x_data) ||
          c3n == u3ud(y_data) )
     {
-      return u3m_bail(c3__exit);
+      u3m_bail(c3__exit);
     } else {
       u3_noun x_shape, x_bloq, x_kind,
               y_shape,
@@ -3298,11 +3340,11 @@
            c3n == _check(u3nc(y_meta, y_data))
          )
       {
-        return u3m_bail(c3__exit);
+        u3m_bail(c3__exit);
       } else {
         switch (x_kind) {
           case c3__i754:
-            _set_rounding(rnd);
+            _set_rounding_la(rnd);
             u3_noun r_data = u3qi_la_mmul_i754(x_data, y_data, x_shape, y_shape, x_bloq);
             // result is already [meta data]
             return r_data;

From 473ddd1dde8ec3066839c5b587b016ce31714c0b Mon Sep 17 00:00:00 2001
From: Sigilante <neal.davis@protonmail.com>
Date: Mon, 1 Jun 2026 09:04:07 -0500
Subject: [PATCH 2/4] ext/softblas: bump pin to 4e70272 (latest)

Bump the vendored SoftBLAS dependency from 40ff4fa to 4e70272
(urbit/SoftBLAS HEAD).  Notable upstream changes:

- macOS SoftFloat SIGBUS fix (use GCC platform.h for the Darwin build)
  -- directly relevant to building vere on macOS.
- drotmg NaN/Inf hang fix; C1-C8 NaN/Inf/stride determinism coverage.
- new complex/integer/quad Level-1 routines and a qrot signature fix
  (float16->float128); not compiled here and not called by the lagoon
  jet, so no jet changes.

The lagoon jets call only {s,d,h,q}{dot,axpy,scal} + {s,d,h,q}gemm,
none of which changed (only their tests did), so jetted behavior is
identical.  Updated build.zig.zon hash accordingly.  Full vere build
verified (zig 0.15.2, aarch64-macos).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 ext/softblas/build.zig.zon | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ext/softblas/build.zig.zon b/ext/softblas/build.zig.zon
index df0a8c68aa..31b1f80a31 100644
--- a/ext/softblas/build.zig.zon
+++ b/ext/softblas/build.zig.zon
@@ -7,8 +7,8 @@
             .path = "../softfloat",
         },
         .softblas = .{
-            .url = "https://github.com/urbit/SoftBLAS/archive/40ff4fa0a3fba0a566ac5fcb8ab93d280fddf7af.tar.gz",
-            .hash = "N-V-__8AAObNNABARc0F3nVusDf1nQ4yxZH78h5A6HkWWAA9",
+            .url = "https://github.com/urbit/SoftBLAS/archive/4e702725445ca24d1fda5785b89c607aeb23e3eb.tar.gz",
+            .hash = "N-V-__8AAHqeNgCJpLl6qqnAsLF-YvRnkIFV2A1drRDlV7MH",
         },
     },
     .paths = .{

From 9027b3eef44700e9ed575490ca969e53cc095751 Mon Sep 17 00:00:00 2001
From: Sigilante <neal.davis@protonmail.com>
Date: Mon, 8 Jun 2026 20:32:55 -0500
Subject: [PATCH 3/4] noun: fix %mod jet to C-fmod truncation + NaN on
 divide-by-zero

The %mod ray-op (u3qi_la_mod_i754) and mod-scalar (u3qi_la_mods_i754)
jets rounded the quotient to nearest (IEEE remainder) and returned the
dividend on a zero divisor, diverging from the Hoon +mod (C fmod:
truncate toward zero; NaN on /0 or a non-finite operand).

Truncate the quotient with softfloat_round_minMag, and on a non-finite
quotient overwrite the result with the width's qNaN.  Both functions,
all four widths.  The 64-bit variant of this fix (PR #1022) was verified
on a live ship.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 pkg/noun/jets/i/lagoon.c | 56 ++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c
index 23a818188d..1d0c07e23f 100644
--- a/pkg/noun/jets/i/lagoon.c
+++ b/pkg/noun/jets/i/lagoon.c
@@ -423,13 +423,16 @@
           float16_t y_val16 = ((float16_t*)y_bytes)[i];
           // Perform division x/n
           float16_t div_result16 = f16_div(x_val16, y_val16);
-          // Round the quotient using the active rounding mode (matches Hoon toi)
-          c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_roundingMode, false);
+          // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod
+          c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_round_minMag, false);
           float16_t quot_round_f16 = i64_to_f16(quot_round16);
           // Multiply n by round(x/n)
           float16_t mult_result16 = f16_mul(y_val16, quot_round_f16);
           // Compute remainder: x - n * round(x/n)
           ((float16_t*)y_bytes)[i] = f16_sub(x_val16, mult_result16);
+          // x % 0 (or a non-finite operand) makes the quotient non-finite;
+          // return NaN to match the Hoon +mod, not the dividend.
+          if ( (div_result16.v & 0x7c00) == 0x7c00 ) { ((float16_t*)y_bytes)[i] = (float16_t){ 0x7e00 }; }
         }
         break;
 
@@ -439,13 +442,16 @@
           float32_t y_val32 = ((float32_t*)y_bytes)[i];
           // Perform division x/n
           float32_t div_result32 = f32_div(x_val32, y_val32);
-          // Round the quotient using the active rounding mode (matches Hoon toi)
-          c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_roundingMode, false);
+          // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod
+          c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_round_minMag, false);
           float32_t quot_round_f32 = i64_to_f32(quot_round32);
           // Multiply n by round(x/n)
           float32_t mult_result32 = f32_mul(y_val32, quot_round_f32);
           // Compute remainder: x - n * round(x/n)
           ((float32_t*)y_bytes)[i] = f32_sub(x_val32, mult_result32);
+          // x % 0 (or a non-finite operand) makes the quotient non-finite;
+          // return NaN to match the Hoon +mod, not the dividend.
+          if ( (div_result32.v & 0x7f800000) == 0x7f800000 ) { ((float32_t*)y_bytes)[i] = (float32_t){ 0x7fc00000 }; }
         }
         break;
 
@@ -455,13 +461,16 @@
           float64_t y_val64 = ((float64_t*)y_bytes)[i];
           // Perform division x/n
           float64_t div_result64 = f64_div(x_val64, y_val64);
-          // Round the quotient using the active rounding mode (matches Hoon toi)
-          c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_roundingMode, false);
+          // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod
+          c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_round_minMag, false);
           float64_t quot_round_f64 = i64_to_f64(quot_round64);
           // Multiply n by round(x/n)
           float64_t mult_result64 = f64_mul(y_val64, quot_round_f64);
           // Compute remainder: x - n * round(x/n)
           ((float64_t*)y_bytes)[i] = f64_sub(x_val64, mult_result64);
+          // x % 0 (or a non-finite operand) makes the quotient non-finite;
+          // return NaN to match the Hoon +mod, not the dividend.
+          if ( (div_result64.v & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL ) { ((float64_t*)y_bytes)[i] = (float64_t){ 0x7ff8000000000000ULL }; }
         }
         break;
 
@@ -472,8 +481,8 @@
           // Perform division x/n
           float128_t div_result128;
           f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128);
-          // Round the quotient using the active rounding mode (matches Hoon toi)
-          c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_roundingMode, false);
+          // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod
+          c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false);
           float128_t quot_round_f128;
           i64_to_f128M(quot_round128, &quot_round_f128);
           // Multiply n by round(x/n)
@@ -481,6 +490,9 @@
           f128M_mul(((float128_t*)&y_val128), ((float128_t*)&quot_round_f128), ((float128_t*)&mult_result128));
           // Compute remainder: x - n * round(x/n)
           f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)y_bytes)[i]));
+          // x % 0 (or a non-finite operand) makes the quotient non-finite;
+          // return NaN to match the Hoon +mod, not the dividend.
+          if ( (div_result128.v[1] & 0x7fff000000000000ULL) == 0x7fff000000000000ULL ) { ((float128_t*)y_bytes)[i] = (float128_t){{ 0, 0x7fff800000000000ULL }}; }
         }
         break;
     }
@@ -1625,13 +1637,16 @@
           float16_t x_val16 = ((float16_t*)x_bytes)[i];
           // Perform division x/n
           float16_t div_result16 = f16_mul(in16, x_val16);
-          // Round the quotient using the active rounding mode (matches Hoon toi)
-          c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_roundingMode, false);
+          // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod
+          c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_round_minMag, false);
           float16_t quot_round_f16 = i64_to_f16(quot_round16);
           // Multiply n by round(x/n)
           float16_t mult_result16 = f16_mul(n16, quot_round_f16);
           // Compute remainder: x - n * round(x/n)
           ((float16_t*)x_bytes)[i] = f16_sub(x_val16, mult_result16);
+          // x % 0 (or a non-finite operand) makes the quotient non-finite;
+          // return NaN to match the Hoon +mod, not the dividend.
+          if ( (div_result16.v & 0x7c00) == 0x7c00 ) { ((float16_t*)x_bytes)[i] = (float16_t){ 0x7e00 }; }
         }
         break;
 
@@ -1643,13 +1658,16 @@
           float32_t x_val32 = ((float32_t*)x_bytes)[i];
           // Perform division x/n
           float32_t div_result32 = f32_mul((float32_t)in32, (float32_t)x_val32);
-          // Round the quotient using the active rounding mode (matches Hoon toi)
-          c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_roundingMode, false);
+          // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod
+          c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_round_minMag, false);
           float32_t quot_round_f32 = i64_to_f32(quot_round32);
           // Multiply n by round(x/n)
           float32_t mult_result32 = f32_mul(n32, quot_round_f32);
           // Compute remainder: x - n * round(x/n)
           ((float32_t*)x_bytes)[i] = f32_sub(x_val32, mult_result32);
+          // x % 0 (or a non-finite operand) makes the quotient non-finite;
+          // return NaN to match the Hoon +mod, not the dividend.
+          if ( (div_result32.v & 0x7f800000) == 0x7f800000 ) { ((float32_t*)x_bytes)[i] = (float32_t){ 0x7fc00000 }; }
         }
         break;
 
@@ -1661,13 +1679,16 @@
           float64_t x_val64 = ((float64_t*)x_bytes)[i];
           // Perform division x/n
           float64_t div_result64 = f64_mul(in64, x_val64);
-          // Round the quotient using the active rounding mode (matches Hoon toi)
-          c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_roundingMode, false);
+          // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod
+          c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_round_minMag, false);
           float64_t quot_round_f64 = i64_to_f64(quot_round64);
           // Multiply n by round(x/n)
           float64_t mult_result64 = f64_mul(n64, quot_round_f64);
           // Compute remainder: x - n * round(x/n)
           ((float64_t*)x_bytes)[i] = f64_sub(x_val64, mult_result64);
+          // x % 0 (or a non-finite operand) makes the quotient non-finite;
+          // return NaN to match the Hoon +mod, not the dividend.
+          if ( (div_result64.v & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL ) { ((float64_t*)x_bytes)[i] = (float64_t){ 0x7ff8000000000000ULL }; }
         }
         break;
 
@@ -1680,8 +1701,8 @@
           // Perform division x/n
           float128_t div_result128;
           f128M_mul((float128_t*)&in128, (float128_t*)&x_val128, (float128_t*)&div_result128);
-          // Round the quotient using the active rounding mode (matches Hoon toi)
-          c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_roundingMode, false);
+          // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod
+          c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false);
           float128_t quot_round_f128;
           i64_to_f128M(quot_round128, &quot_round_f128);
           // Multiply n by round(x/n)
@@ -1689,6 +1710,9 @@
           f128M_mul(((float128_t*)&n128), ((float128_t*)&quot_round_f128), ((float128_t*)&mult_result128));
           // Compute remainder: x - n * round(x/n)
           f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)x_bytes)[i]));
+          // x % 0 (or a non-finite operand) makes the quotient non-finite;
+          // return NaN to match the Hoon +mod, not the dividend.
+          if ( (div_result128.v[1] & 0x7fff000000000000ULL) == 0x7fff000000000000ULL ) { ((float128_t*)x_bytes)[i] = (float128_t){{ 0, 0x7fff800000000000ULL }}; }
         }
         break;
     }

From 22b09f21a0641f635068171860aa091cf8f35097 Mon Sep 17 00:00:00 2001
From: Sigilante <neal.davis@protonmail.com>
Date: Wed, 10 Jun 2026 15:45:53 -0500
Subject: [PATCH 4/4] noun: normalize @rq jet word-count to width constant n

Mirror the 64-bit fix: introduce the n word-count constant (VERE64 ? 2
: 4) and use it everywhere instead of literal 4. No behavior change on
the 32-bit build (n=4), but makes the file correct-by-construction if
c3_w widens, and keeps it in sync with the vere64 rq.c sub fix.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 pkg/noun/jets/e/rq.c | 62 ++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/pkg/noun/jets/e/rq.c b/pkg/noun/jets/e/rq.c
index e26a4de0af..b3bc91b147 100644
--- a/pkg/noun/jets/e/rq.c
+++ b/pkg/noun/jets/e/rq.c
@@ -13,6 +13,12 @@
     c3_w* c;
   };
 
+#ifdef VERE64
+  static const c3_w n = 2;
+#else
+  static const c3_w n = 4;
+#endif
+
   static inline c3_t
   _nan_test(float128_t* a)
   {
@@ -65,12 +71,12 @@
     d.c = alloca(16);
     e.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
-    u3r_words(0, 4, d.c, b);
+    u3r_words(0, n, c.c, a);
+    u3r_words(0, n, d.c, b);
     f128M_add(c.q, d.q, e.q);
     _nan_unify(e.q);
 
-    u3_atom f = u3i_words(4, e.c);
+    u3_atom f = u3i_words(n, e.c);
     return f;
   }
 
@@ -103,12 +109,12 @@
     d.c = alloca(16);
     e.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
-    u3r_words(0, 4, d.c, b);
+    u3r_words(0, n, c.c, a);
+    u3r_words(0, n, d.c, b);
     f128M_sub(c.q, d.q, e.q);
     _nan_unify(e.q);
 
-    u3_atom f = u3i_words(4, e.c);
+    u3_atom f = u3i_words(n, e.c);
     return f;
   }
 
@@ -141,12 +147,12 @@
     d.c = alloca(16);
     e.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
-    u3r_words(0, 4, d.c, b);
+    u3r_words(0, n, c.c, a);
+    u3r_words(0, n, d.c, b);
     f128M_mul(c.q, d.q, e.q);
     _nan_unify(e.q);
 
-    u3_atom f = u3i_words(4, e.c);
+    u3_atom f = u3i_words(n, e.c);
     return f;
   }
 
@@ -179,12 +185,12 @@
     d.c = alloca(16);
     e.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
-    u3r_words(0, 4, d.c, b);
+    u3r_words(0, n, c.c, a);
+    u3r_words(0, n, d.c, b);
     f128M_div(c.q, d.q, e.q);
     _nan_unify(e.q);
 
-    u3_atom f = u3i_words(4, e.c);
+    u3_atom f = u3i_words(n, e.c);
     return f;
   }
 
@@ -215,11 +221,11 @@
     c.c = alloca(16);
     d.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
+    u3r_words(0, n, c.c, a);
     f128M_sqrt(c.q, d.q);
     _nan_unify(d.q);
 
-    u3_atom e = u3i_words(4, d.c);
+    u3_atom e = u3i_words(n, d.c);
     return e;
   }
 
@@ -253,13 +259,13 @@
     f.c = alloca(16);
     g.c = alloca(16);
 
-    u3r_words(0, 4, d.c, a);
-    u3r_words(0, 4, e.c, b);
-    u3r_words(0, 4, f.c, c);
+    u3r_words(0, n, d.c, a);
+    u3r_words(0, n, e.c, b);
+    u3r_words(0, n, f.c, c);
     f128M_mulAdd(d.q, e.q, f.q, g.q);
     _nan_unify(g.q);
 
-    u3_atom h = u3i_words(4, g.c);
+    u3_atom h = u3i_words(n, g.c);
     return h;
   }
 
@@ -290,8 +296,8 @@
     c.c = alloca(16);
     d.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
-    u3r_words(0, 4, d.c, b);
+    u3r_words(0, n, c.c, a);
+    u3r_words(0, n, d.c, b);
     c3_o e = __(f128M_lt(c.q, d.q));
 
     return e;
@@ -323,8 +329,8 @@
     c.c = alloca(16);
     d.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
-    u3r_words(0, 4, d.c, b);
+    u3r_words(0, n, c.c, a);
+    u3r_words(0, n, d.c, b);
     c3_o e = __(f128M_le(c.q, d.q));
 
     return e;
@@ -356,8 +362,8 @@
     c.c = alloca(16);
     d.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
-    u3r_words(0, 4, d.c, b);
+    u3r_words(0, n, c.c, a);
+    u3r_words(0, n, d.c, b);
     c3_o e = __(f128M_eq(c.q, d.q));
 
     return e;
@@ -389,8 +395,8 @@
     c.c = alloca(16);
     d.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
-    u3r_words(0, 4, d.c, b);
+    u3r_words(0, n, c.c, a);
+    u3r_words(0, n, d.c, b);
     c3_o e = __(f128M_le(d.q, c.q));
 
     return e;
@@ -422,8 +428,8 @@
     c.c = alloca(16);
     d.c = alloca(16);
 
-    u3r_words(0, 4, c.c, a);
-    u3r_words(0, 4, d.c, b);
+    u3r_words(0, n, c.c, a);
+    u3r_words(0, n, d.c, b);
     c3_o e = __(f128M_lt(d.q, c.q));
 
     return e;