From 772b97e48d30ad97b1f05d3681461f3bafbbd0b0 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Sun, 31 May 2026 07:28:47 -0500 Subject: [PATCH 1/4] noun: migrate lagoon jet to stateless SoftBLAS Bump ext/softblas to urbit/SoftBLAS 40ff4fa (allocation-free; dropped the global softblas_roundingMode for a per-call rndMode arg; B1-B5 correctness fixes) and drop the now-deleted src/softblas_state.c from the wrapper. Update the lagoon jet for the new API: thread the active rounding mode through all 32 BLAS calls via a file-static _la_rnd (replacing the removed global), and rename the local _set_rounding -> _set_rounding_la to avoid colliding with softblas.h's new inline _set_rounding. The jet source matches numerics canonical (lagoon/vere). Built with zig 0.15.2; add/dot/mmul/mod verified correct on a fake ~hex. Co-Authored-By: Claude Opus 4.8 --- ext/softblas/build.zig | 1 - ext/softblas/build.zig.zon | 4 +- pkg/noun/jets/i/lagoon.c | 434 ++++++++++++++++++++----------------- 3 files changed, 240 insertions(+), 199 deletions(-) diff --git a/ext/softblas/build.zig b/ext/softblas/build.zig index 560e4977fc..2bf3e5e7eb 100644 --- a/ext/softblas/build.zig +++ b/ext/softblas/build.zig @@ -24,7 +24,6 @@ pub fn build(b: *std.Build) void { lib.addCSourceFiles(.{ .root = dep_c.path(""), .files = &.{ - "src/softblas_state.c", "src/blas/level1/sasum.c", "src/blas/level1/dasum.c", "src/blas/level1/hasum.c", diff --git a/ext/softblas/build.zig.zon b/ext/softblas/build.zig.zon index 8f657dfb98..df0a8c68aa 100644 --- a/ext/softblas/build.zig.zon +++ b/ext/softblas/build.zig.zon @@ -7,8 +7,8 @@ .path = "../softfloat", }, .softblas = .{ - .url = "https://github.com/urbit/SoftBLAS/archive/cbffb33f19ea02f9ffbd184d445123c57929ec53.tar.gz", - .hash = "N-V-__8AAJD5MgBhfBHYae8jFlcaQw9R-TRw4tcUFB3rO9-q", + .url = "https://github.com/urbit/SoftBLAS/archive/40ff4fa0a3fba0a566ac5fcb8ab93d280fddf7af.tar.gz", + .hash = "N-V-__8AAObNNABARc0F3nVusDf1nQ4yxZH78h5A6HkWWAA9", }, }, .paths = .{ diff --git 
a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index db7088b96e..23a818188d 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -37,9 +37,16 @@ c3_d c[2]; }; - // $?(%n %u %d %z %a) + // SoftBLAS is now stateless: rounding is passed per call. This holds + // the active mode (replaces the old softblas_roundingMode global). + static c3_y _la_rnd = 'n'; + + // Set the SoftFloat/SoftBLAS rounding mode from a rounding-mode atom. + // Accepts %n %u %d %z (see +$rounding-mode); %a (nearest, ties away) + // is handled too, but the hoon @rs/@rd/@rq/@rh doors never produce it. + // Any other value bails. static inline void - _set_rounding(c3_w a) + _set_rounding_la(c3_w a) { // We could use SoftBLAS set_rounding() to set the SoftFloat // mode as well, but it's more explicit to do it here since @@ -53,27 +60,27 @@ // %n - near case c3__n: softfloat_roundingMode = softfloat_round_near_even; - softblas_roundingMode = 'n'; + _la_rnd = 'n'; break; // %z - zero case c3__z: softfloat_roundingMode = softfloat_round_minMag; - softblas_roundingMode = 'z'; + _la_rnd = 'z'; break; // %u - up case c3__u: softfloat_roundingMode = softfloat_round_max; - softblas_roundingMode = 'u'; + _la_rnd = 'u'; break; // %d - down case c3__d: softfloat_roundingMode = softfloat_round_min; - softblas_roundingMode = 'd'; + _la_rnd = 'd'; break; // %a - away case c3__a: softfloat_roundingMode = softfloat_round_near_maxMag; - softblas_roundingMode = 'a'; + _la_rnd = 'a'; break; } } @@ -152,24 +159,25 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); + u3r_bytes(0, syz_x, y_bytes, y_data); + y_bytes[syz_x] = 0x1; // Switch on the block size. 
switch (u3x_atom(bloq)) { case 4: - haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1, _la_rnd); break; case 5: - saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1, _la_rnd); break; case 6: - daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1, _la_rnd); break; case 7: - qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1, _la_rnd); break; } @@ -204,35 +212,36 @@ // syz_x is length in bytes c3_d syz_x = len_x * pow(2, bloq-3); - // x_bytes is the data array (w/o leading 0x1) - c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy; holds result) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); u3r_bytes(0, syz_x, x_bytes, x_data); + x_bytes[syz_x] = 0x1; - // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); - - // Switch on the block size. + // y_bytes is the data array (w/o leading 0x1) + c3_y* y_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, y_bytes, y_data); + + // Switch on the block size. Computes x_bytes := -1*y + x = x - y. 
switch (u3x_atom(bloq)) { case 4: - haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1, _la_rnd); break; case 5: - saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1, _la_rnd); break; case 6: - daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1, _la_rnd); break; case 7: - qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1, _la_rnd); break; } // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); // Clean up and return. u3a_free(x_bytes); @@ -269,7 +278,8 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); + u3r_bytes(0, syz_x, y_bytes, y_data); + y_bytes[syz_x] = 0x1; // Switch on the block size. switch (u3x_atom(bloq)) { @@ -335,7 +345,8 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); + u3r_bytes(0, syz_x, y_bytes, y_data); + y_bytes[syz_x] = 0x1; // Switch on the block size. 
switch (u3x_atom(bloq)) { @@ -401,7 +412,8 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); + u3r_bytes(0, syz_x, y_bytes, y_data); + y_bytes[syz_x] = 0x1; // Switch on the block size. switch (u3x_atom(bloq)) { @@ -411,12 +423,12 @@ float16_t y_val16 = ((float16_t*)y_bytes)[i]; // Perform division x/n float16_t div_result16 = f16_div(x_val16, y_val16); - // Compute floor of the division result - c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); - float16_t floor_float16 = i64_to_f16(floor_result16); - // Multiply n by floor(x/n) - float16_t mult_result16 = f16_mul(y_val16, floor_float16); - // Compute remainder: x - n * floor(x/n) + // Round the quotient using the active rounding mode (matches Hoon toi) + c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_roundingMode, false); + float16_t quot_round_f16 = i64_to_f16(quot_round16); + // Multiply n by round(x/n) + float16_t mult_result16 = f16_mul(y_val16, quot_round_f16); + // Compute remainder: x - n * round(x/n) ((float16_t*)y_bytes)[i] = f16_sub(x_val16, mult_result16); } break; @@ -427,12 +439,12 @@ float32_t y_val32 = ((float32_t*)y_bytes)[i]; // Perform division x/n float32_t div_result32 = f32_div(x_val32, y_val32); - // Compute floor of the division result - c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); - float32_t floor_float32 = i64_to_f32(floor_result32); - // Multiply n by floor(x/n) - float32_t mult_result32 = f32_mul(y_val32, floor_float32); - // Compute remainder: x - n * floor(x/n) + // Round the quotient using the active rounding mode (matches Hoon toi) + c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_roundingMode, false); + float32_t quot_round_f32 = i64_to_f32(quot_round32); + // Multiply n by round(x/n) + float32_t mult_result32 = f32_mul(y_val32, quot_round_f32); + // Compute remainder: x - n * 
round(x/n) ((float32_t*)y_bytes)[i] = f32_sub(x_val32, mult_result32); } break; @@ -443,12 +455,12 @@ float64_t y_val64 = ((float64_t*)y_bytes)[i]; // Perform division x/n float64_t div_result64 = f64_div(x_val64, y_val64); - // Compute floor of the division result - c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); - float64_t floor_float64 = i64_to_f64(floor_result64); - // Multiply n by floor(x/n) - float64_t mult_result64 = f64_mul(y_val64, floor_float64); - // Compute remainder: x - n * floor(x/n) + // Round the quotient using the active rounding mode (matches Hoon toi) + c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_roundingMode, false); + float64_t quot_round_f64 = i64_to_f64(quot_round64); + // Multiply n by round(x/n) + float64_t mult_result64 = f64_mul(y_val64, quot_round_f64); + // Compute remainder: x - n * round(x/n) ((float64_t*)y_bytes)[i] = f64_sub(x_val64, mult_result64); } break; @@ -460,14 +472,14 @@ // Perform division x/n float128_t div_result128; f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128); - // Compute floor of the division result - c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); - float128_t floor_float128; - i64_to_f128M(floor_result128, &floor_float128); - // Multiply n by floor(x/n) + // Round the quotient using the active rounding mode (matches Hoon toi) + c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_roundingMode, false); + float128_t quot_round_f128; + i64_to_f128M(quot_round128, &quot_round_f128); + // Multiply n by round(x/n) float128_t mult_result128; - f128M_mul(((float128_t*)&y_val128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); - // Compute remainder: x - n * floor(x/n) + f128M_mul(((float128_t*)&y_val128), ((float128_t*)&quot_round_f128), ((float128_t*)&mult_result128)); + // Compute remainder: x - n * round(x/n) f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128),
&(((float128_t*)y_bytes)[i])); } break; @@ -580,7 +592,7 @@ c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); u3r_bytes(0, syz_x, x_bytes, x_data); - c3_w min_idx = 0; + c3_d min_idx = 0; // Switch on the block size. switch (u3x_atom(bloq)) { @@ -589,7 +601,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f16_lt(((float16_t*)x_bytes)[i], min_val16)) { min_val16 = ((float16_t*)x_bytes)[i]; - min_idx = (len_x - i - 1); + min_idx = i; } } break;} @@ -599,7 +611,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f32_lt(((float32_t*)x_bytes)[i], min_val32)) { min_val32 = ((float32_t*)x_bytes)[i]; - min_idx = (len_x - i - 1); + min_idx = i; } } break;} @@ -609,7 +621,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f64_lt(((float64_t*)x_bytes)[i], min_val64)) { min_val64 = ((float64_t*)x_bytes)[i]; - min_idx = (len_x - i - 1); + min_idx = i; } } break;} @@ -619,7 +631,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f128M_lt(&(((float128_t*)x_bytes)[i]), &min_val128)) { min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); - min_idx = (len_x - i - 1); + min_idx = i; } } break;} @@ -653,7 +665,7 @@ c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); u3r_bytes(0, syz_x, x_bytes, x_data); - c3_w max_idx = 0; + c3_d max_idx = 0; // Switch on the block size. 
switch (u3x_atom(bloq)) { @@ -662,7 +674,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f16_gt(((float16_t*)x_bytes)[i], max_val16)) { max_val16 = ((float16_t*)x_bytes)[i]; - max_idx = (len_x - i - 1); + max_idx = i; } } break;} @@ -672,7 +684,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f32_gt(((float32_t*)x_bytes)[i], max_val32)) { max_val32 = ((float32_t*)x_bytes)[i]; - max_idx = (len_x - i - 1); + max_idx = i; } } break;} @@ -682,7 +694,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f64_gt(((float64_t*)x_bytes)[i], max_val64)) { max_val64 = ((float64_t*)x_bytes)[i]; - max_idx = (len_x - i - 1); + max_idx = i; } } break;} @@ -692,7 +704,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f128M_gt(&(((float128_t*)x_bytes)[i]), &max_val128)) { max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); - max_idx = (len_x - i - 1); + max_idx = i; } } break;} @@ -1055,7 +1067,7 @@ return r_data; } -/* gte - x > y +/* gte - x >= y */ u3_noun u3qi_la_gte_i754(u3_noun x_data, @@ -1128,7 +1140,7 @@ return r_data; } -/* lth - x > y +/* lth - x < y */ u3_noun u3qi_la_lth_i754(u3_noun x_data, @@ -1201,7 +1213,7 @@ return r_data; } -/* lte - x > y +/* lte - x <= y */ u3_noun u3qi_la_lte_i754(u3_noun x_data, @@ -1314,7 +1326,7 @@ for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = n16; } - haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1, _la_rnd); break; case 5: @@ -1323,7 +1335,7 @@ for (c3_d i = 0; i < len_x; i++) { ((float32_t*)y_bytes)[i] = n32; } - saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1, _la_rnd); break; case 6: @@ -1332,7 +1344,7 @@ for (c3_d i = 0; i < len_x; i++) { ((float64_t*)y_bytes)[i] = n64; } - daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + 
daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1, _la_rnd); break; case 7: @@ -1341,7 +1353,7 @@ for (c3_d i = 0; i < len_x; i++) { ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; } - qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1, _la_rnd); break; } @@ -1396,7 +1408,7 @@ for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = n16; } - haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1); + haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1, _la_rnd); break; case 5: @@ -1405,7 +1417,7 @@ for (c3_d i = 0; i < len_x; i++) { ((float32_t*)y_bytes)[i] = n32; } - saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1); + saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1, _la_rnd); break; case 6: @@ -1414,7 +1426,7 @@ for (c3_d i = 0; i < len_x; i++) { ((float64_t*)y_bytes)[i] = n64; } - daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1); + daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1, _la_rnd); break; case 7: @@ -1423,7 +1435,7 @@ for (c3_d i = 0; i < len_x; i++) { ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; } - qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1); + qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1, _la_rnd); break; } @@ -1473,22 +1485,22 @@ switch (u3x_atom(bloq)) { case 4: u3r_bytes(0, 2, (c3_y*)&(n16.v), n); - hscal(len_x, n16, (float16_t*)x_bytes, 1); + hscal(len_x, n16, (float16_t*)x_bytes, 1, _la_rnd); break; case 5: u3r_bytes(0, 4, 
(c3_y*)&(n32.v), n); - sscal(len_x, n32, (float32_t*)x_bytes, 1); + sscal(len_x, n32, (float32_t*)x_bytes, 1, _la_rnd); break; case 6: u3r_bytes(0, 8, (c3_y*)&(n64.v), n); - dscal(len_x, n64, (float64_t*)x_bytes, 1); + dscal(len_x, n64, (float64_t*)x_bytes, 1, _la_rnd); break; case 7: u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); - qscal(len_x, n128, (float128_t*)x_bytes, 1); + qscal(len_x, n128, (float128_t*)x_bytes, 1, _la_rnd); break; } @@ -1538,28 +1550,28 @@ // XX note that in16 is doing double duty here u3r_bytes(0, 2, (c3_y*)&(in16.v), n); in16 = f16_div((float16_t){SB_REAL16_ONE}, in16); - hscal(len_x, in16, (float16_t*)x_bytes, 1); + hscal(len_x, in16, (float16_t*)x_bytes, 1, _la_rnd); break; case 5: // XX note that in32 is doing double duty here u3r_bytes(0, 4, (c3_y*)&(in32.v), n); in32 = f32_div((float32_t){SB_REAL32_ONE}, in32); - sscal(len_x, in32, (float32_t*)x_bytes, 1); + sscal(len_x, in32, (float32_t*)x_bytes, 1, _la_rnd); break; case 6: // XX note that in64 is doing double duty here u3r_bytes(0, 8, (c3_y*)&(in64.v), n); in64 = f64_div((float64_t){SB_REAL64_ONE}, in64); - dscal(len_x, in64, (float64_t*)x_bytes, 1); + dscal(len_x, in64, (float64_t*)x_bytes, 1, _la_rnd); break; case 7: // XX note that in128 is doing double duty here u3r_bytes(0, 16, (c3_y*)&(in128.v[0]), n); f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &in128, &in128); - qscal(len_x, in128, (float128_t*)x_bytes, 1); + qscal(len_x, in128, (float128_t*)x_bytes, 1, _la_rnd); break; } @@ -1613,12 +1625,12 @@ float16_t x_val16 = ((float16_t*)x_bytes)[i]; // Perform division x/n float16_t div_result16 = f16_mul(in16, x_val16); - // Compute floor of the division result - c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); - float16_t floor_float16 = i64_to_f16(floor_result16); - // Multiply n by floor(x/n) - float16_t mult_result16 = f16_mul(n16, floor_float16); - // Compute remainder: x - n * floor(x/n) + // Round the quotient using the active 
rounding mode (matches Hoon toi) + c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_roundingMode, false); + float16_t quot_round_f16 = i64_to_f16(quot_round16); + // Multiply n by round(x/n) + float16_t mult_result16 = f16_mul(n16, quot_round_f16); + // Compute remainder: x - n * round(x/n) ((float16_t*)x_bytes)[i] = f16_sub(x_val16, mult_result16); } break; @@ -1630,13 +1642,13 @@ for (c3_d i = 0; i < len_x; i++) { float32_t x_val32 = ((float32_t*)x_bytes)[i]; // Perform division x/n - float32_t div_result32 = f32_mul(in32, x_val32); - // Compute floor of the division result - c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); - float32_t floor_float32 = i64_to_f32(floor_result32); - // Multiply n by floor(x/n) - float32_t mult_result32 = f32_mul(n32, floor_float32); - // Compute remainder: x - n * floor(x/n) + float32_t div_result32 = f32_mul((float32_t)in32, (float32_t)x_val32); + // Round the quotient using the active rounding mode (matches Hoon toi) + c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_roundingMode, false); + float32_t quot_round_f32 = i64_to_f32(quot_round32); + // Multiply n by round(x/n) + float32_t mult_result32 = f32_mul(n32, quot_round_f32); + // Compute remainder: x - n * round(x/n) ((float32_t*)x_bytes)[i] = f32_sub(x_val32, mult_result32); } break; @@ -1649,33 +1661,33 @@ float64_t x_val64 = ((float64_t*)x_bytes)[i]; // Perform division x/n float64_t div_result64 = f64_mul(in64, x_val64); - // Compute floor of the division result - c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); - float64_t floor_float64 = i64_to_f64(floor_result64); - // Multiply n by floor(x/n) - float64_t mult_result64 = f64_mul(n64, floor_float64); - // Compute remainder: x - n * floor(x/n) + // Round the quotient using the active rounding mode (matches Hoon toi) + c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_roundingMode, false); + float64_t quot_round_f64 = 
i64_to_f64(quot_round64); + // Multiply n by round(x/n) + float64_t mult_result64 = f64_mul(n64, quot_round_f64); + // Compute remainder: x - n * round(x/n) ((float64_t*)x_bytes)[i] = f64_sub(x_val64, mult_result64); } break; case 7: u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); - f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ZERO}), &n128, &in128); + f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &n128, &in128); for (c3_d i = 0; i < len_x; i++) { float128_t x_val128 = ((float128_t*)x_bytes)[i]; // Perform division x/n float128_t div_result128; f128M_mul((float128_t*)&in128, (float128_t*)&x_val128, (float128_t*)&div_result128); - // Compute floor of the division result - c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); - float128_t floor_float128; - i64_to_f128M(floor_result128, &floor_float128); - // Multiply n by floor(x/n) + // Round the quotient using the active rounding mode (matches Hoon toi) + c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_roundingMode, false); + float128_t quot_round_f128; + i64_to_f128M(quot_round128, &quot_round_f128); + // Multiply n by round(x/n) float128_t mult_result128; - f128M_mul(((float128_t*)&n128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); - // Compute remainder: x - n * floor(x/n) + f128M_mul(((float128_t*)&n128), ((float128_t*)&quot_round_f128), ((float128_t*)&mult_result128)); + // Compute remainder: x - n * round(x/n) f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)x_bytes)[i])); } break; @@ -1724,28 +1736,28 @@ switch (u3x_atom(bloq)) { case 4: { float16_t r16[2]; - r16[0] = hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + r16[0] = hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1, _la_rnd); r16[1].v = 0x1; r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); break;} case 5: { float32_t r32[2]; - r32[0] = sdot(len_x, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + r32[0] = sdot(len_x,
(float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1, _la_rnd); r32[1].v = 0x1; r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); break;} case 6: { float64_t r64[2]; - r64[0] = ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + r64[0] = ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1, _la_rnd); r64[1].v = 0x1; r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); break;} case 7: { float128_t r128[2]; - r128[0] = qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + r128[0] = qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1, _la_rnd); r128[1] = (float128_t){0x1, 0x0}; r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); break;} @@ -1771,12 +1783,12 @@ } // Assert length of dims is 2. if (u3qb_lent(shape) != 2) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } // Unpack shape into an array of dimensions. c3_d *dims = _get_dims(shape); if (dims[0] != dims[1]) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } // Unpack the data as a byte array. We assume total length < 2**64. @@ -1818,7 +1830,7 @@ { // Assert length of dims is 2. if (u3qb_lent(shape) != 2) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } // Unpack shape into an array of dimensions. c3_d *dims = _get_dims(shape); @@ -1867,6 +1879,11 @@ return u3_none; } + // Guard degenerate count: n = 0 underflows n-1 and writes out of bounds. 
+ if (n < 1) { + return u3_none; + } + u3_noun r_data; switch (u3x_atom(bloq)) { @@ -1971,7 +1988,9 @@ u3r_bytes(0, 2, (c3_y*)&(a16.v), a); u3r_bytes(0, 2, (c3_y*)&(b16.v), b); u3r_bytes(0, 2, (c3_y*)&(interval16.v), d); - c3_d n16 = f16_to_i64(f16_ceil(f16_div(f16_sub(b16, a16), interval16)), softfloat_round_minMag, false); + c3_ds raw_n16 = f16_to_i64(f16_ceil(f16_div(f16_sub(b16, a16), interval16)), softfloat_round_minMag, false); + if ( raw_n16 < 1 || raw_n16 > 0xffffffff ) { return u3_none; } + c3_d n16 = raw_n16; c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n16+1)*2)*sizeof(c3_y)); ((float16_t*)x_bytes16)[0] = a16; for (c3_d i = 1; i < n16; i++) { @@ -1987,7 +2006,9 @@ u3r_bytes(0, 4, (c3_y*)&(a32.v), a); u3r_bytes(0, 4, (c3_y*)&(b32.v), b); u3r_bytes(0, 4, (c3_y*)&(interval32.v), d); - c3_d n32 = f32_to_i64(f32_ceil(f32_div(f32_sub(b32, a32), interval32)), softfloat_round_minMag, false); + c3_ds raw_n32 = f32_to_i64(f32_ceil(f32_div(f32_sub(b32, a32), interval32)), softfloat_round_minMag, false); + if ( raw_n32 < 1 || raw_n32 > 0xffffffff ) { return u3_none; } + c3_d n32 = raw_n32; c3_y* x_bytes32 = (c3_y*)u3a_malloc(((n32+1)*4)*sizeof(c3_y)); ((float32_t*)x_bytes32)[0] = a32; for (c3_d i = 1; i < n32; i++) { @@ -2003,7 +2024,9 @@ u3r_bytes(0, 8, (c3_y*)&(a64.v), a); u3r_bytes(0, 8, (c3_y*)&(b64.v), b); u3r_bytes(0, 8, (c3_y*)&(interval64.v), d); - c3_d n64 = f64_to_i64(f64_ceil(f64_div(f64_sub(b64, a64), interval64)), softfloat_round_minMag, false); + c3_ds raw_n64 = f64_to_i64(f64_ceil(f64_div(f64_sub(b64, a64), interval64)), softfloat_round_minMag, false); + if ( raw_n64 < 1 || raw_n64 > 0xffffffff ) { return u3_none; } + c3_d n64 = raw_n64; c3_y* x_bytes64 = (c3_y*)u3a_malloc(((n64+1)*8)*sizeof(c3_y)); ((float64_t*)x_bytes64)[0] = a64; for (c3_d i = 1; i < n64; i++) { @@ -2023,7 +2046,9 @@ f128M_sub(&b128, &a128, &tmp); f128M_div(&tmp, &interval128, &tmp); f128M_ceil(&tmp, &tmp); - c3_d n128 = f128M_to_i64(&tmp, softfloat_round_minMag, false); + c3_ds raw_n128 
= f128M_to_i64(&tmp, softfloat_round_minMag, false); + if ( raw_n128 < 1 || raw_n128 > 0xffffffff ) { return u3_none; } + c3_d n128 = raw_n128; c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n128+1)*16)*sizeof(c3_y)); float128_t i128; ((float128_t*)x_bytes128)[0] = a128; @@ -2050,7 +2075,9 @@ u3_noun bloq) { u3_noun d_data = u3qi_la_diag(x_data, shape, bloq); - c3_d len_x0 = _get_dims(shape)[0]; + c3_d *dim_x = _get_dims(shape); + c3_d len_x0 = dim_x[0]; + u3a_free(dim_x); u3_noun r_data = u3qi_la_dot_i754(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), u3k(bloq)); return r_data; } @@ -2070,10 +2097,15 @@ c3_d Nb= u3x_atom(u3h(y_shape)); c3_d P = u3x_atom(u3h(u3t(y_shape))); + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + if ((u3_nul != u3t(u3t(x_shape))) || (u3_nul != u3t(u3t(y_shape))) || (Na != Nb)) { - return u3m_bail(c3__exit); + return u3_none; } c3_d N = Na; @@ -2115,19 +2147,19 @@ // Switch on the block size. switch (u3x_atom(bloq)) { case 4: - hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, P, (float16_t){SB_REAL16_ZERO}, (float16_t*)r_bytes, P); + hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, P, (float16_t){SB_REAL16_ZERO}, (float16_t*)r_bytes, P, _la_rnd); break; case 5: - sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, P, (float32_t){SB_REAL32_ZERO}, (float32_t*)r_bytes, P); + sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, P, (float32_t){SB_REAL32_ZERO}, (float32_t*)r_bytes, P, _la_rnd); break; case 6: - dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, P, (float64_t){SB_REAL64_ZERO}, (float64_t*)r_bytes, P); + dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, P, (float64_t){SB_REAL64_ZERO}, (float64_t*)r_bytes, P, _la_rnd); break; case 
7: - qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, P, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)r_bytes, P); + qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, P, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)r_bytes, P, _la_rnd); break; } @@ -2160,7 +2192,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2174,11 +2206,11 @@ c3n == u3ud(rnd) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_add_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2207,7 +2239,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2221,11 +2253,11 @@ c3n == u3ud(rnd) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_sub_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2254,7 +2286,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2268,11 +2300,11 @@ c3n == u3ud(rnd) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_mul_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return 
u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2301,7 +2333,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2315,11 +2347,11 @@ c3n == u3ud(rnd) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_div_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2348,7 +2380,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2362,11 +2394,11 @@ c3n == u3ud(rnd) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_mod_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2390,7 +2422,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2404,11 +2436,11 @@ c3n == _check(u3nc(x_meta, x_data)) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_cumsum_i754(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3nc(0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2432,7 +2464,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 @@ -2443,7 +2475,7 @@ c3n == _check(u3nc(x_meta, x_data)) ) { - return u3m_bail(c3__exit); + 
u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2470,7 +2502,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 @@ -2481,7 +2513,7 @@ c3n == _check(u3nc(x_meta, x_data)) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2508,7 +2540,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 @@ -2519,7 +2551,7 @@ c3n == _check(u3nc(x_meta, x_data)) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2546,7 +2578,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 @@ -2558,7 +2590,7 @@ c3n == _check(u3nc(x_meta, x_data)) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2585,7 +2617,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 @@ -2597,7 +2629,7 @@ c3n == _check(u3nc(x_meta, x_data)) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2624,7 +2656,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 @@ -2635,7 +2667,7 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2667,7 +2699,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 @@ -2677,7 +2709,7 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + 
u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2709,7 +2741,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 @@ -2719,7 +2751,7 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2751,7 +2783,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 @@ -2761,7 +2793,7 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2793,7 +2825,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 @@ -2803,7 +2835,7 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -2832,7 +2864,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2843,7 +2875,7 @@ rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_adds_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2868,7 +2900,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2879,7 +2911,7 @@ rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_subs_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), 
u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2904,7 +2936,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2915,7 +2947,7 @@ rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_muls_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2940,7 +2972,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2951,7 +2983,7 @@ rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_divs_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -2976,7 +3008,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -2987,7 +3019,7 @@ rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_mods_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); @@ -3015,7 +3047,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; @@ -3028,14 +3060,16 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_dot_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return 
u3_none; } - c3_d len_x0 = _get_dims(x_shape)[0]; + c3_d *dim_x = _get_dims(x_shape); + c3_d len_x0 = dim_x[0]; + u3a_free(dim_x); return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: @@ -3057,7 +3091,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 @@ -3069,7 +3103,7 @@ c3n == _check(cor) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun r_data = u3qi_la_transpose(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } @@ -3090,7 +3124,7 @@ u3x_sam_7, &n, 0)) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 @@ -3104,11 +3138,11 @@ (n < 1) // crash on zero size ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_linspace_i754(a, b, n, x_bloq); if (r_data == u3_none) { return u3_none; } x_shape = u3nc(u3x_atom(n), u3_nul); @@ -3133,7 +3167,7 @@ u3x_sam_7, &d, 0)) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 @@ -3145,34 +3179,40 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_range_i754(a, b, d, x_bloq); if (r_data == u3_none) { return u3_none; } c3_d a_, b_, d_; c3_ds n_; switch (x_bloq) { - case 4: + case 4: { + c3_d a_, b_, d_; u3r_bytes(0, 2, (c3_y*)&a_, a); u3r_bytes(0, 2, (c3_y*)&b_, b); u3r_bytes(0, 2, (c3_y*)&d_, d); n_ = f16_to_i64(f16_ceil(f16_div(f16_sub((float16_t){b_}, (float16_t){a_}), (float16_t){d_})), softfloat_round_minMag, false) - 1; break; - case 5: + } + case 5: { + c3_d a_, b_, d_; u3r_bytes(0, 4, (c3_y*)&a_, a); u3r_bytes(0, 4, 
(c3_y*)&b_, b); u3r_bytes(0, 4, (c3_y*)&d_, d); n_ = f32_to_i64(f32_ceil(f32_div(f32_sub((float32_t){b_}, (float32_t){a_}), (float32_t){d_})), softfloat_round_minMag, false) - 1; break; - case 6: + } + case 6: { + c3_d a_, b_, d_; u3r_bytes(0, 8, (c3_y*)&a_, a); u3r_bytes(0, 8, (c3_y*)&b_, b); u3r_bytes(0, 8, (c3_y*)&d_, d); n_ = f64_to_i64(f64_ceil(f64_div(f64_sub((float64_t){b_}, (float64_t){a_}), (float64_t){d_})), softfloat_round_minMag, false) - 1; break; + } case 7: { c3_d a__[2], b__[2], d__[2]; u3r_bytes(0, 16, (c3_y*)&a__, a); @@ -3208,7 +3248,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 @@ -3220,11 +3260,13 @@ c3n == _check(cor) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun r_data = u3qi_la_diag(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - c3_d len_x0 = _get_dims(x_shape)[0]; + c3_d *dim_x = _get_dims(x_shape); + c3_d len_x0 = dim_x[0]; + u3a_free(dim_x); return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); } } @@ -3242,7 +3284,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_tail; if ( c3n == u3r_mean(x_meta, @@ -3253,7 +3295,7 @@ 0) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { @@ -3284,7 +3326,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, y_shape, @@ -3298,11 +3340,11 @@ c3n == _check(u3nc(y_meta, y_data)) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: - _set_rounding(rnd); + _set_rounding_la(rnd); u3_noun r_data = u3qi_la_mmul_i754(x_data, y_data, x_shape, y_shape, x_bloq); // result is already [meta data] return r_data; From 473ddd1dde8ec3066839c5b587b016ce31714c0b 
Mon Sep 17 00:00:00 2001 From: Sigilante Date: Mon, 1 Jun 2026 09:04:07 -0500 Subject: [PATCH 2/4] ext/softblas: bump pin to 4e70272 (latest) Bump the vendored SoftBLAS dependency from 40ff4fa to 4e70272 (urbit/SoftBLAS HEAD). Notable upstream changes: - macOS SoftFloat SIGBUS fix (use GCC platform.h for the Darwin build) -- directly relevant to building vere on macOS. - drotmg NaN/Inf hang fix; C1-C8 NaN/Inf/stride determinism coverage. - new complex/integer/quad Level-1 routines and a qrot signature fix (float16->float128); not compiled here and not called by the lagoon jet, so no jet changes. The lagoon jets call only {s,d,h,q}{dot,axpy,scal} + {s,d,h,q}gemm, none of which changed (only their tests did), so jetted behavior is identical. Updated build.zig.zon hash accordingly. Full vere build verified (zig 0.15.2, aarch64-macos). Co-Authored-By: Claude Opus 4.8 --- ext/softblas/build.zig.zon | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/softblas/build.zig.zon b/ext/softblas/build.zig.zon index df0a8c68aa..31b1f80a31 100644 --- a/ext/softblas/build.zig.zon +++ b/ext/softblas/build.zig.zon @@ -7,8 +7,8 @@ .path = "../softfloat", }, .softblas = .{ - .url = "https://github.com/urbit/SoftBLAS/archive/40ff4fa0a3fba0a566ac5fcb8ab93d280fddf7af.tar.gz", - .hash = "N-V-__8AAObNNABARc0F3nVusDf1nQ4yxZH78h5A6HkWWAA9", + .url = "https://github.com/urbit/SoftBLAS/archive/4e702725445ca24d1fda5785b89c607aeb23e3eb.tar.gz", + .hash = "N-V-__8AAHqeNgCJpLl6qqnAsLF-YvRnkIFV2A1drRDlV7MH", }, }, .paths = .{ From 9027b3eef44700e9ed575490ca969e53cc095751 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Mon, 8 Jun 2026 20:32:55 -0500 Subject: [PATCH 3/4] noun: fix %mod jet to C-fmod truncation + NaN on divide-by-zero The %mod ray-op (u3qi_la_mod_i754) and mod-scalar (u3qi_la_mods_i754) jets rounded the quotient to nearest (IEEE remainder) and returned the dividend on a zero divisor, diverging from the Hoon +mod (C fmod: truncate toward zero; NaN on /0 or a 
non-finite operand). Truncate the quotient with softfloat_round_minMag, and on a non-finite quotient overwrite the result with the width's qNaN. Both functions, all four widths. The 64-bit variant of this fix (PR #1022) was verified on a live ship. Co-Authored-By: Claude Opus 4.8 --- pkg/noun/jets/i/lagoon.c | 56 ++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 23a818188d..1d0c07e23f 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -423,13 +423,16 @@ float16_t y_val16 = ((float16_t*)y_bytes)[i]; // Perform division x/n float16_t div_result16 = f16_div(x_val16, y_val16); - // Round the quotient using the active rounding mode (matches Hoon toi) - c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_roundingMode, false); + // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod + c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_round_minMag, false); float16_t quot_round_f16 = i64_to_f16(quot_round16); // Multiply n by round(x/n) float16_t mult_result16 = f16_mul(y_val16, quot_round_f16); // Compute remainder: x - n * round(x/n) ((float16_t*)y_bytes)[i] = f16_sub(x_val16, mult_result16); + // x % 0 (or a non-finite operand) makes the quotient non-finite; + // return NaN to match the Hoon +mod, not the dividend. 
+ if ( (div_result16.v & 0x7c00) == 0x7c00 ) { ((float16_t*)y_bytes)[i] = (float16_t){ 0x7e00 }; } } break; @@ -439,13 +442,16 @@ float32_t y_val32 = ((float32_t*)y_bytes)[i]; // Perform division x/n float32_t div_result32 = f32_div(x_val32, y_val32); - // Round the quotient using the active rounding mode (matches Hoon toi) - c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_roundingMode, false); + // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod + c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_round_minMag, false); float32_t quot_round_f32 = i64_to_f32(quot_round32); // Multiply n by round(x/n) float32_t mult_result32 = f32_mul(y_val32, quot_round_f32); // Compute remainder: x - n * round(x/n) ((float32_t*)y_bytes)[i] = f32_sub(x_val32, mult_result32); + // x % 0 (or a non-finite operand) makes the quotient non-finite; + // return NaN to match the Hoon +mod, not the dividend. + if ( (div_result32.v & 0x7f800000) == 0x7f800000 ) { ((float32_t*)y_bytes)[i] = (float32_t){ 0x7fc00000 }; } } break; @@ -455,13 +461,16 @@ float64_t y_val64 = ((float64_t*)y_bytes)[i]; // Perform division x/n float64_t div_result64 = f64_div(x_val64, y_val64); - // Round the quotient using the active rounding mode (matches Hoon toi) - c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_roundingMode, false); + // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod + c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_round_minMag, false); float64_t quot_round_f64 = i64_to_f64(quot_round64); // Multiply n by round(x/n) float64_t mult_result64 = f64_mul(y_val64, quot_round_f64); // Compute remainder: x - n * round(x/n) ((float64_t*)y_bytes)[i] = f64_sub(x_val64, mult_result64); + // x % 0 (or a non-finite operand) makes the quotient non-finite; + // return NaN to match the Hoon +mod, not the dividend. 
+ if ( (div_result64.v & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL ) { ((float64_t*)y_bytes)[i] = (float64_t){ 0x7ff8000000000000ULL }; } } break; @@ -472,8 +481,8 @@ // Perform division x/n float128_t div_result128; f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128); - // Round the quotient using the active rounding mode (matches Hoon toi) - c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_roundingMode, false); + // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod + c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); float128_t quot_round_f128; i64_to_f128M(quot_round128, &quot_round_f128); // Multiply n by round(x/n) @@ -481,6 +490,9 @@ f128M_mul(((float128_t*)&y_val128), ((float128_t*)&quot_round_f128), ((float128_t*)&mult_result128)); // Compute remainder: x - n * round(x/n) f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)y_bytes)[i])); + // x % 0 (or a non-finite operand) makes the quotient non-finite; + // return NaN to match the Hoon +mod, not the dividend.
+ if ( (div_result128.v[1] & 0x7fff000000000000ULL) == 0x7fff000000000000ULL ) { ((float128_t*)y_bytes)[i] = (float128_t){{ 0, 0x7fff800000000000ULL }}; } } break; } @@ -1625,13 +1637,16 @@ float16_t x_val16 = ((float16_t*)x_bytes)[i]; // Perform division x/n float16_t div_result16 = f16_mul(in16, x_val16); - // Round the quotient using the active rounding mode (matches Hoon toi) - c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_roundingMode, false); + // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod + c3_ds quot_round16 = f16_to_i64(div_result16, softfloat_round_minMag, false); float16_t quot_round_f16 = i64_to_f16(quot_round16); // Multiply n by round(x/n) float16_t mult_result16 = f16_mul(n16, quot_round_f16); // Compute remainder: x - n * round(x/n) ((float16_t*)x_bytes)[i] = f16_sub(x_val16, mult_result16); + // x % 0 (or a non-finite operand) makes the quotient non-finite; + // return NaN to match the Hoon +mod, not the dividend. + if ( (div_result16.v & 0x7c00) == 0x7c00 ) { ((float16_t*)x_bytes)[i] = (float16_t){ 0x7e00 }; } } break; @@ -1643,13 +1658,16 @@ float32_t x_val32 = ((float32_t*)x_bytes)[i]; // Perform division x/n float32_t div_result32 = f32_mul((float32_t)in32, (float32_t)x_val32); - // Round the quotient using the active rounding mode (matches Hoon toi) - c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_roundingMode, false); + // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod + c3_ds quot_round32 = f32_to_i64(div_result32, softfloat_round_minMag, false); float32_t quot_round_f32 = i64_to_f32(quot_round32); // Multiply n by round(x/n) float32_t mult_result32 = f32_mul(n32, quot_round_f32); // Compute remainder: x - n * round(x/n) ((float32_t*)x_bytes)[i] = f32_sub(x_val32, mult_result32); + // x % 0 (or a non-finite operand) makes the quotient non-finite; + // return NaN to match the Hoon +mod, not the dividend. 
+ if ( (div_result32.v & 0x7f800000) == 0x7f800000 ) { ((float32_t*)x_bytes)[i] = (float32_t){ 0x7fc00000 }; } } break; @@ -1661,13 +1679,16 @@ float64_t x_val64 = ((float64_t*)x_bytes)[i]; // Perform division x/n float64_t div_result64 = f64_mul(in64, x_val64); - // Round the quotient using the active rounding mode (matches Hoon toi) - c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_roundingMode, false); + // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod + c3_ds quot_round64 = f64_to_i64(div_result64, softfloat_round_minMag, false); float64_t quot_round_f64 = i64_to_f64(quot_round64); // Multiply n by round(x/n) float64_t mult_result64 = f64_mul(n64, quot_round_f64); // Compute remainder: x - n * round(x/n) ((float64_t*)x_bytes)[i] = f64_sub(x_val64, mult_result64); + // x % 0 (or a non-finite operand) makes the quotient non-finite; + // return NaN to match the Hoon +mod, not the dividend. + if ( (div_result64.v & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL ) { ((float64_t*)x_bytes)[i] = (float64_t){ 0x7ff8000000000000ULL }; } } break; @@ -1680,8 +1701,8 @@ // Perform division x/n float128_t div_result128; f128M_mul((float128_t*)&in128, (float128_t*)&x_val128, (float128_t*)&div_result128); - // Round the quotient using the active rounding mode (matches Hoon toi) - c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_roundingMode, false); + // Truncate the quotient toward zero (round_minMag): C fmod, matching +mod + c3_ds quot_round128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); float128_t quot_round_f128; i64_to_f128M(quot_round128, &quot_round_f128); // Multiply n by round(x/n) @@ -1689,6 +1710,9 @@ f128M_mul(((float128_t*)&n128), ((float128_t*)&quot_round_f128), ((float128_t*)&mult_result128)); // Compute remainder: x - n * round(x/n) f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)x_bytes)[i])); + // x % 0 (or a non-finite operand) makes the quotient non-finite; + //
return NaN to match the Hoon +mod, not the dividend. + if ( (div_result128.v[1] & 0x7fff000000000000ULL) == 0x7fff000000000000ULL ) { ((float128_t*)x_bytes)[i] = (float128_t){{ 0, 0x7fff800000000000ULL }}; } } break; } From 22b09f21a0641f635068171860aa091cf8f35097 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 10 Jun 2026 15:45:53 -0500 Subject: [PATCH 4/4] noun: normalize @rq jet word-count to width constant n Mirror the 64-bit fix: introduce the n word-count constant (VERE64 ? 2 : 4) and use it everywhere instead of literal 4. No behavior change on the 32-bit build (n=4), but makes the file correct-by-construction if c3_w widens, and keeps it in sync with the vere64 rq.c sub fix. Co-Authored-By: Claude Opus 4.8 --- pkg/noun/jets/e/rq.c | 62 ++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/pkg/noun/jets/e/rq.c b/pkg/noun/jets/e/rq.c index e26a4de0af..b3bc91b147 100644 --- a/pkg/noun/jets/e/rq.c +++ b/pkg/noun/jets/e/rq.c @@ -13,6 +13,12 @@ c3_w* c; }; +#ifdef VERE64 + static const c3_w n = 2; +#else + static const c3_w n = 4; +#endif + static inline c3_t _nan_test(float128_t* a) { @@ -65,12 +71,12 @@ d.c = alloca(16); e.c = alloca(16); - u3r_words(0, 4, c.c, a); - u3r_words(0, 4, d.c, b); + u3r_words(0, n, c.c, a); + u3r_words(0, n, d.c, b); f128M_add(c.q, d.q, e.q); _nan_unify(e.q); - u3_atom f = u3i_words(4, e.c); + u3_atom f = u3i_words(n, e.c); return f; } @@ -103,12 +109,12 @@ d.c = alloca(16); e.c = alloca(16); - u3r_words(0, 4, c.c, a); - u3r_words(0, 4, d.c, b); + u3r_words(0, n, c.c, a); + u3r_words(0, n, d.c, b); f128M_sub(c.q, d.q, e.q); _nan_unify(e.q); - u3_atom f = u3i_words(4, e.c); + u3_atom f = u3i_words(n, e.c); return f; } @@ -141,12 +147,12 @@ d.c = alloca(16); e.c = alloca(16); - u3r_words(0, 4, c.c, a); - u3r_words(0, 4, d.c, b); + u3r_words(0, n, c.c, a); + u3r_words(0, n, d.c, b); f128M_mul(c.q, d.q, e.q); _nan_unify(e.q); - u3_atom f = u3i_words(4, e.c); + u3_atom f = 
u3i_words(n, e.c); return f; } @@ -179,12 +185,12 @@ d.c = alloca(16); e.c = alloca(16); - u3r_words(0, 4, c.c, a); - u3r_words(0, 4, d.c, b); + u3r_words(0, n, c.c, a); + u3r_words(0, n, d.c, b); f128M_div(c.q, d.q, e.q); _nan_unify(e.q); - u3_atom f = u3i_words(4, e.c); + u3_atom f = u3i_words(n, e.c); return f; } @@ -215,11 +221,11 @@ c.c = alloca(16); d.c = alloca(16); - u3r_words(0, 4, c.c, a); + u3r_words(0, n, c.c, a); f128M_sqrt(c.q, d.q); _nan_unify(d.q); - u3_atom e = u3i_words(4, d.c); + u3_atom e = u3i_words(n, d.c); return e; } @@ -253,13 +259,13 @@ f.c = alloca(16); g.c = alloca(16); - u3r_words(0, 4, d.c, a); - u3r_words(0, 4, e.c, b); - u3r_words(0, 4, f.c, c); + u3r_words(0, n, d.c, a); + u3r_words(0, n, e.c, b); + u3r_words(0, n, f.c, c); f128M_mulAdd(d.q, e.q, f.q, g.q); _nan_unify(g.q); - u3_atom h = u3i_words(4, g.c); + u3_atom h = u3i_words(n, g.c); return h; } @@ -290,8 +296,8 @@ c.c = alloca(16); d.c = alloca(16); - u3r_words(0, 4, c.c, a); - u3r_words(0, 4, d.c, b); + u3r_words(0, n, c.c, a); + u3r_words(0, n, d.c, b); c3_o e = __(f128M_lt(c.q, d.q)); return e; @@ -323,8 +329,8 @@ c.c = alloca(16); d.c = alloca(16); - u3r_words(0, 4, c.c, a); - u3r_words(0, 4, d.c, b); + u3r_words(0, n, c.c, a); + u3r_words(0, n, d.c, b); c3_o e = __(f128M_le(c.q, d.q)); return e; @@ -356,8 +362,8 @@ c.c = alloca(16); d.c = alloca(16); - u3r_words(0, 4, c.c, a); - u3r_words(0, 4, d.c, b); + u3r_words(0, n, c.c, a); + u3r_words(0, n, d.c, b); c3_o e = __(f128M_eq(c.q, d.q)); return e; @@ -389,8 +395,8 @@ c.c = alloca(16); d.c = alloca(16); - u3r_words(0, 4, c.c, a); - u3r_words(0, 4, d.c, b); + u3r_words(0, n, c.c, a); + u3r_words(0, n, d.c, b); c3_o e = __(f128M_le(d.q, c.q)); return e; @@ -422,8 +428,8 @@ c.c = alloca(16); d.c = alloca(16); - u3r_words(0, 4, c.c, a); - u3r_words(0, 4, d.c, b); + u3r_words(0, n, c.c, a); + u3r_words(0, n, d.c, b); c3_o e = __(f128M_lt(d.q, c.q)); return e;