builders-lab · souls-syntax · Apr 30, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/.gitignore b/.gitignore
@@ -15,7 +15,7 @@ check.txt
 [Dd]esktop.ini
 
 .venv/
-
+build_wsl
 # Recycle Bin used on file shares
 $RECYCLE.BIN/
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -4,10 +4,12 @@ project(soft-cuda LANGUAGES CXX CUDA)
 # Core library sources — explicitly exclude the Python bridge subdir
 file(GLOB_RECURSE LIB_SOURCE src/*.cu src/*.cpp src/*.hpp src/*.h)
 list(FILTER LIB_SOURCE EXCLUDE REGEX ".*/python/.*")
+list(FILTER LIB_SOURCE EXCLUDE REGEX ".*/profiler\\.cu$")  # "\\." = literal dot; a single "\." collapses to "." (any char)
 
 set(CMAKE_CUDA_ARCHITECTURES native)
 
 add_library(soft_lib SHARED ${LIB_SOURCE})
+target_link_libraries(soft_lib PRIVATE cublas)  # NOTE(review): bare library name; find_package(CUDAToolkit) + CUDA::cublas would likely be more portable - confirm
 
 set_target_properties(soft_lib PROPERTIES
   CXX_STANDARD 17
@@ -19,6 +21,7 @@ target_include_directories(soft_lib
   PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/backend_cpu/include"
   PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include"
   PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src"
+  PRIVATE "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
 )
 
 target_compile_options(soft_lib PRIVATE
@@ -37,16 +40,16 @@ set_target_properties(soft_lib PROPERTIES
 # target_compile_options(soft_lib PUBLIC
 #   $<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CXX>>:-fsanitize=address>
 # )
-target_compile_options(soft_lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-g>)
+target_compile_options(soft_lib PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-g;--expt-relaxed-constexpr>")  # quoted: a space inside an unquoted genex splits it into two arguments
 #
 # target_link_options(soft_lib PUBLIC
 #   $<$<CONFIG:Debug>:-fsanitize=address>
 # )
 
 
-# ─────────────────────────────────────────────────────────────────────────────
+
 # Python C bridge — flat-C shared library for use with ctypes / cffi
-# ─────────────────────────────────────────────────────────────────────────────
+
 add_library(soft_cuda_python SHARED src/python/sc_bridge.cpp)
 
 set_target_properties(soft_cuda_python PROPERTIES
@@ -65,11 +68,31 @@ target_include_directories(soft_cuda_python
 add_executable(soft main.cpp)
 target_link_libraries(soft_cuda_python PRIVATE soft_lib)
 target_link_libraries(soft PRIVATE soft_cuda_python)
-
 # Ensure all sc_* symbols are exported on Linux/macOS
 if(NOT WIN32)
   target_compile_options(soft_cuda_python PRIVATE -fvisibility=default)
 endif()
 
 enable_testing()
 add_subdirectory(tests)
+
+# HARDWARE PROFILER
+
+# Standalone executable built from src/init/config/profiler.cu (the file is
+# filtered out of the soft_lib source glob) and linked against soft_lib.
+add_executable(soft_profiler src/init/config/profiler.cu)
+set_property(TARGET soft_profiler PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+set_property(TARGET soft_profiler PROPERTY CXX_STANDARD 17)
+set_property(TARGET soft_profiler PROPERTY CXX_STANDARD_REQUIRED ON)
+target_include_directories(soft_profiler PRIVATE
+  "${CMAKE_CURRENT_SOURCE_DIR}/include"
+  "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include"
+  "${CMAKE_CURRENT_SOURCE_DIR}/src"
+)
+target_link_libraries(soft_profiler PRIVATE soft_lib)
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/CMakeLists.txt")  # benchmarks are optional: skipped when the directory has no CMakeLists.txt
+  add_subdirectory(benchmarks)
+endif()
+
+
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -0,0 +1,33 @@
+cmake_minimum_required(VERSION 3.17)  # FindCUDAToolkit requires CMake >= 3.17
+
+# Locate the CUDA toolkit wherever it is installed instead of hardcoding
+# /opt/cuda; the CUDA:: imported targets carry include and library paths.
+find_package(CUDAToolkit REQUIRED)
+
+# C++ benchmarks against soft_lib
+add_executable(bench_softcuda bench_softcuda.cpp)
+set_target_properties(bench_softcuda PROPERTIES
+  CXX_STANDARD 17
+  CXX_STANDARD_REQUIRED ON
+)
+target_include_directories(bench_softcuda
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../include"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src/core/include"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src"
+)
+target_link_libraries(bench_softcuda PRIVATE
+  soft_cuda_python
+  CUDA::cublas
+  CUDA::cudart
+)
+# Deep-MLP benchmark executable built from bench_deep_mlp.cpp; it compiles as
+# C++17 and links against the soft_cuda_python bridge library.
+add_executable(bench_deep_mlp bench_deep_mlp.cpp)
+set_property(TARGET bench_deep_mlp PROPERTY CXX_STANDARD 17)
+set_property(TARGET bench_deep_mlp PROPERTY CXX_STANDARD_REQUIRED ON)
+target_include_directories(bench_deep_mlp PRIVATE
+  "${CMAKE_CURRENT_SOURCE_DIR}/../include"
+  "${CMAKE_CURRENT_SOURCE_DIR}/../src/core/include"
+  "${CMAKE_CURRENT_SOURCE_DIR}/../src"
+)
+target_link_libraries(bench_deep_mlp PRIVATE soft_cuda_python)
diff --git a/benchmarks/bench_deep_mlp.cpp b/benchmarks/bench_deep_mlp.cpp
@@ -0,0 +1,137 @@
+/**
+ * bench_deep_mlp.cpp
+ *
+ * Implements a 4-layer Deep MLP to stress-test the Hybrid Dispatcher.
+ * Network:  Input (784) -> Hidden1 (512) -> Hidden2 (256) -> Hidden3 (128) -> Output (10)
+ *
+ * This benchmark demonstrates:
+ *   1. Correct routing of small vs large layers.
+ *   2. Persistence of GPU memory across repeated training steps (100 epochs by default).
+ *   3. Hybrid execution benefit (dispatching compute-heavy layers to GPU,
+ *      and memory-bound/small layers to CPU).
+ */
+
+#include "soft-cuda/tensor/api.h"
+#include "soft-cuda/python/soft_cuda_python.h"
+
+#include <chrono>
+#include <cstdio>
+#include <vector>
+#include <cmath>
+#include <cassert>
+
+// Current std::chrono::high_resolution_clock reading, in (fractional) milliseconds since its epoch.
+static double now_ms() {
+    namespace ch = std::chrono;
+    const auto since_epoch = ch::high_resolution_clock::now().time_since_epoch();
+    return (double)ch::duration_cast<ch::nanoseconds>(since_epoch).count() * 1e-6;
+}
+
+struct Layer { // parameter tensors of one MLP layer, as produced by create_layer()
+    sc_tensor_t *W; // weights; create_layer() allocates these with dims {in_dim, out_dim}
+    sc_tensor_t *b; // bias; create_layer() allocates this with dims {1, out_dim}
+};
+
+// Allocates a layer from `pool`: float32 W with dims {in_dim, out_dim} and b with dims {1, out_dim}, both randomized.
+static Layer create_layer(sc_pool_t *pool, uint32_t in_dim, uint32_t out_dim) {
+    uint32_t w_shape[] = {in_dim, out_dim}, b_shape[] = {1, out_dim};
+    Layer layer{sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, w_shape, NULL, 1),
+                sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, b_shape, NULL, 1)};
+    sc_tensor_fill_random_normal(layer.W, 0.0f, sqrtf(2.0f / (float)in_dim)); // He init
+    sc_tensor_fill_random_normal(layer.b, 0.0f, 0.01f);
+    return layer;
+}
+
+// Chains sc_tensor_mul_naive(X, L.W) -> sc_tensor_add(.., L.b) -> optional sc_tensor_relu; returns the last tensor.
+static sc_tensor_t* forward_layer(sc_pool_t *pool, sc_tensor_t *X, Layer &L, bool use_relu = true) {
+    sc_tensor_t *product = sc_tensor_mul_naive(pool, X, L.W);
+    sc_tensor_t *biased = sc_tensor_add(pool, product, L.b);
+    return use_relu ? sc_tensor_relu(pool, biased) : biased;
+}
+
+// Builds the 4-layer MLP and its mean-squared-error graph once, then times `epochs` sc_graph_step calls.
+// backend_mode: SC_BACKEND_* value passed to sc_build_graph; label: heading printed before the run.
+void run_mlp_bench(int backend_mode, const char *label, int epochs = 100) {
+    printf("--- Benchmarking %s ---\n", label);
+    const uint32_t BATCH = 64;
+    const uint32_t D_IN = 784, D1 = 512, D2 = 256, D3 = 128, D_OUT = 10;
+
+    sc_pool_t *pool = sc_pool_create(128 * 1024 * 1024, 0); // 128MB CPU
+    sc_pool_t *meta = sc_pool_create(8 * 1024 * 1024, 0);   // 8MB Meta
+    sc_pool_t *gpc  = sc_pool_create(32 * 1024 * 1024, 0);  // 32MB Grad CPU
+
+    bool use_gpu = (backend_mode != SC_BACKEND_CPU);
+    sc_pool_t *gpg  = sc_pool_create(use_gpu ? 32 * 1024 * 1024 : 1024, use_gpu ? 1 : 0);
+    sc_pool_t *pgpu = sc_pool_create(use_gpu ? 128 * 1024 * 1024 : 1024, use_gpu ? 1 : 0);
+
+    // Single teardown helper so every exit path releases whichever pools were created.
+    auto destroy_pools = [&]() {
+        for (sc_pool_t *p : {pool, meta, gpc, gpg, pgpu}) if (p) sc_pool_destroy(p);
+    };
+    if (!pool || !meta || !gpc || !gpg || !pgpu) {
+        printf("FAILED to allocate pools (backend_mode=%d)\n", backend_mode);
+        destroy_pools();
+        return;
+    }
+
+    uint32_t dX[] = {BATCH, D_IN};
+    uint32_t dY[] = {BATCH, D_OUT};
+    sc_tensor_t *X = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dX, NULL, 0);
+    sc_tensor_t *Y = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dY, NULL, 0);
+    sc_tensor_fill_random_normal(X, 0.5f, 0.2f);
+    sc_tensor_fill_random_normal(Y, 0.1f, 0.05f);
+
+    Layer L1 = create_layer(pool, D_IN, D1);
+    Layer L2 = create_layer(pool, D1, D2);
+    Layer L3 = create_layer(pool, D2, D3);
+    Layer L4 = create_layer(pool, D3, D_OUT);
+
+    // Build Graph
+    sc_tensor_t *H1 = forward_layer(pool, X, L1);
+    sc_tensor_t *H2 = forward_layer(pool, H1, L2);
+    sc_tensor_t *H3 = forward_layer(pool, H2, L3);
+    sc_tensor_t *Yp = forward_layer(pool, H3, L4, false);
+
+    sc_tensor_t *diff = sc_tensor_sub(pool, Yp, Y);
+    sc_tensor_t *sq   = sc_tensor_square(pool, diff);
+    sc_tensor_t *loss = sc_tensor_mean(pool, sq);
+
+    sc_graph_t *g = sc_build_graph(meta, pgpu, gpc, gpg, loss, backend_mode);
+    if (!g) {
+        printf("FAILED to build graph\n");
+        destroy_pools(); // this path previously returned without releasing the pools
+        return;
+    }
+
+    sc_graph_step(pool, pgpu, g, 0.01f); // Warmup (runs before the timer starts)
+
+    // Report about five times per run; clamp to 1 so epochs < 5 cannot trigger a modulo by zero.
+    const int report_every = (epochs >= 5) ? epochs / 5 : 1;
+    double t0 = now_ms();
+    for (int i = 0; i < epochs; i++) {
+        sc_graph_step(pool, pgpu, g, 0.01f);
+        if ((i+1) % report_every == 0) {
+             printf("  Epoch %d/%d | Loss: %.6f\n", i+1, epochs, sc_graph_get_loss(g));
+        }
+    }
+    double elapsed = now_ms() - t0;
+    printf("  [RESULT] Total: %.2f ms | Avg: %.2f ms/step\n\n", elapsed, elapsed / epochs);
+    sc_graph_destroy(g);
+    destroy_pools();
+}
+
+// Entry point: runs run_mlp_bench once per backend mode with the default epoch count.
+int main() {
+    printf("\n--- Deep MLP Benchmark (4-Layer, Hybrid Dispatch) ---\n\n");
+
+    // Order: CPU reference, GPU reference, then Hybrid Dispatch (AOT-profiled).
+    // The hybrid run will use thresholds from CONFIG.soft.
+    const struct { int mode; const char *label; } runs[] = {
+        {SC_BACKEND_CPU,    "CPU-Only Backend (Baseline)"},
+        {SC_BACKEND_GPU,    "GPU-Only Backend (Full Acceleration)"},
+        {SC_BACKEND_HYBRID, "HYBRID Backend (AOT-Optimized)"},
+    };
+    for (const auto &run : runs) run_mlp_bench(run.mode, run.label);
+
+    return 0;
+}