diff --git a/.gitignore b/.gitignore
index eaa7d52..5ecc666 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,7 @@ check.txt
 [Dd]esktop.ini
 
 .venv/
-
+build_wsl
 # Recycle Bin used on file shares
 $RECYCLE.BIN/
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 972d242..a50c672 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,10 +4,12 @@ project(soft-cuda LANGUAGES CXX CUDA)
 # Core library sources — explicitly exclude the Python bridge subdir
 file(GLOB_RECURSE LIB_SOURCE src/*.cu src/*.cpp src/*.hpp src/*.h)
 list(FILTER LIB_SOURCE EXCLUDE REGEX ".*/python/.*")
+list(FILTER LIB_SOURCE EXCLUDE REGEX ".*/profiler\.cu$")
 
 set(CMAKE_CUDA_ARCHITECTURES native)
 
 add_library(soft_lib SHARED ${LIB_SOURCE})
+target_link_libraries(soft_lib PRIVATE cublas)
 
 set_target_properties(soft_lib PROPERTIES
   CXX_STANDARD 17
@@ -19,6 +21,7 @@ target_include_directories(soft_lib
   PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/backend_cpu/include"
   PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include"
   PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src"
+  PRIVATE "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
 )
 
 target_compile_options(soft_lib PRIVATE
@@ -37,16 +40,16 @@ set_target_properties(soft_lib PROPERTIES
 # target_compile_options(soft_lib PUBLIC
 #   $<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CXX>>:-fsanitize=address>
 # )
-target_compile_options(soft_lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-g>)
+target_compile_options(soft_lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-g --expt-relaxed-constexpr>)
 #
 # target_link_options(soft_lib PUBLIC
 #   $<$<CONFIG:Debug>:-fsanitize=address>
 # )
 
 
-# ─────────────────────────────────────────────────────────────────────────────
+
 # Python C bridge — flat-C shared library for use with ctypes / cffi
-# ─────────────────────────────────────────────────────────────────────────────
+
 add_library(soft_cuda_python SHARED src/python/sc_bridge.cpp)
 
 set_target_properties(soft_cuda_python PROPERTIES
@@ -65,7 +68,6 @@ target_include_directories(soft_cuda_python
 add_executable(soft main.cpp)
 target_link_libraries(soft_cuda_python PRIVATE soft_lib)
 target_link_libraries(soft PRIVATE soft_cuda_python)
-
 # Ensure all sc_* symbols are exported on Linux/macOS
 if(NOT WIN32)
   target_compile_options(soft_cuda_python PRIVATE -fvisibility=default)
@@ -73,3 +75,24 @@ endif()
 
 enable_testing()
 add_subdirectory(tests)
+
+# HARDWARE PROFILER
+
+add_executable(soft_profiler src/init/config/profiler.cu)
+set_target_properties(soft_profiler PROPERTIES
+  CUDA_SEPARABLE_COMPILATION ON
+  CXX_STANDARD 17
+  CXX_STANDARD_REQUIRED ON
+)
+target_include_directories(soft_profiler
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src"
+)
+target_link_libraries(soft_profiler PRIVATE soft_lib)
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/CMakeLists.txt")
+  add_subdirectory(benchmarks)
+endif()
+
+
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..60be91f
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,33 @@
+cmake_minimum_required(VERSION 3.16)
+
+# C++ benchmarks against soft_lib
+add_executable(bench_softcuda bench_softcuda.cpp)
+set_target_properties(bench_softcuda PROPERTIES
+  CXX_STANDARD 17
+  CXX_STANDARD_REQUIRED ON
+)
+target_include_directories(bench_softcuda
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../include"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src/core/include"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src"
+)
+target_link_libraries(bench_softcuda PRIVATE soft_cuda_python)
+target_include_directories(bench_softcuda PRIVATE
+    "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
+)
+target_link_directories(bench_softcuda PRIVATE
+    "/opt/cuda/targets/x86_64-linux/lib"
+)
+target_link_libraries(bench_softcuda PRIVATE cublas)
+target_link_libraries(bench_softcuda PRIVATE cublas cudart)
+add_executable(bench_deep_mlp bench_deep_mlp.cpp)
+set_target_properties(bench_deep_mlp PROPERTIES
+  CXX_STANDARD 17
+  CXX_STANDARD_REQUIRED ON
+)
+target_include_directories(bench_deep_mlp
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../include"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src/core/include"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src"
+)
+target_link_libraries(bench_deep_mlp PRIVATE soft_cuda_python)
diff --git a/benchmarks/bench_deep_mlp.cpp b/benchmarks/bench_deep_mlp.cpp
new file mode 100644
index 0000000..b772aeb
--- /dev/null
+++ b/benchmarks/bench_deep_mlp.cpp
@@ -0,0 +1,137 @@
+/**
+ * bench_deep_mlp.cpp
+ *
+ * Implements a 4-layer Deep MLP to stress-test the Hybrid Dispatcher.
+ * Network:  Input (784) -> Hidden1 (512) -> Hidden2 (256) -> Hidden3 (128) -> Output (10)
+ *
+ * This benchmark demonstrates:
+ *   1. Correct routing of small vs large layers.
+ *   2. Persistence of GPU memory across 1000 iterations.
+ *   3. Hybrid execution benefit (dispatching compute-heavy layers to GPU,
+ *      and memory-bound/small layers to CPU).
+ */
+
+#include "soft-cuda/tensor/api.h"
+#include "soft-cuda/python/soft_cuda_python.h"
+
+#include <chrono>
+#include <cstdio>
+#include <vector>
+#include <cmath>
+#include <cassert>
+
+static double now_ms() {
+    using namespace std::chrono;
+    return (double)duration_cast<nanoseconds>(
+               high_resolution_clock::now().time_since_epoch())
+               .count() * 1e-6;
+}
+
+struct Layer {
+    sc_tensor_t *W;
+    sc_tensor_t *b;
+};
+
+static Layer create_layer(sc_pool_t *pool, uint32_t in_dim, uint32_t out_dim) {
+    uint32_t dW[] = {in_dim, out_dim};
+    uint32_t db[] = {1, out_dim};
+    sc_tensor_t *W = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dW, NULL, 1);
+    sc_tensor_t *b = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, db, NULL, 1);
+    sc_tensor_fill_random_normal(W, 0.0f, sqrtf(2.0f / (float)in_dim)); // He init
+    sc_tensor_fill_random_normal(b, 0.0f, 0.01f);
+    return {W, b};
+}
+
+static sc_tensor_t* forward_layer(sc_pool_t *pool, sc_tensor_t *X, Layer &L, bool use_relu = true) {
+    sc_tensor_t *mat = sc_tensor_mul_naive(pool, X, L.W);
+    sc_tensor_t *add = sc_tensor_add(pool, mat, L.b);
+    if (use_relu) return sc_tensor_relu(pool, add);
+    return add;
+}
+
+void run_mlp_bench(int backend_mode, const char *label, int epochs = 100) {
+    printf("--- Benchmarking %s ---\n", label);
+
+    const uint32_t BATCH = 64;
+    const uint32_t D_IN = 784, D1 = 512, D2 = 256, D3 = 128, D_OUT = 10;
+
+    sc_pool_t *pool = sc_pool_create(128 * 1024 * 1024, 0); // 128MB CPU
+    sc_pool_t *meta = sc_pool_create(8 * 1024 * 1024, 0);   // 8MB Meta
+    sc_pool_t *gpc  = sc_pool_create(32 * 1024 * 1024, 0);  // 32MB Grad CPU
+    
+    bool use_gpu = (backend_mode != SC_BACKEND_CPU);
+    sc_pool_t *gpg  = sc_pool_create(use_gpu ? 32 * 1024 * 1024 : 1024, use_gpu ? 1 : 0);
+    sc_pool_t *pgpu = sc_pool_create(use_gpu ? 128 * 1024 * 1024 : 1024, use_gpu ? 1 : 0);
+
+    if (!pool || !meta || !gpc || !gpg || !pgpu) {
+        printf("FAILED to allocate pools (backend_mode=%d)\n", backend_mode);
+        if (pool) sc_pool_destroy(pool);
+        if (meta) sc_pool_destroy(meta);
+        if (gpc) sc_pool_destroy(gpc);
+        if (gpg) sc_pool_destroy(gpg);
+        if (pgpu) sc_pool_destroy(pgpu);
+        return;
+    }
+
+    uint32_t dX[] = {BATCH, D_IN};
+    uint32_t dY[] = {BATCH, D_OUT};
+    sc_tensor_t *X = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dX, NULL, 0);
+    sc_tensor_t *Y = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dY, NULL, 0);
+    sc_tensor_fill_random_normal(X, 0.5f, 0.2f);
+    sc_tensor_fill_random_normal(Y, 0.1f, 0.05f);
+
+    Layer L1 = create_layer(pool, D_IN, D1);
+    Layer L2 = create_layer(pool, D1, D2);
+    Layer L3 = create_layer(pool, D2, D3);
+    Layer L4 = create_layer(pool, D3, D_OUT);
+
+    // Build Graph
+    sc_tensor_t *H1 = forward_layer(pool, X, L1);
+    sc_tensor_t *H2 = forward_layer(pool, H1, L2);
+    sc_tensor_t *H3 = forward_layer(pool, H2, L3);
+    sc_tensor_t *Yp = forward_layer(pool, H3, L4, false);
+    
+    sc_tensor_t *diff = sc_tensor_sub(pool, Yp, Y);
+    sc_tensor_t *sq   = sc_tensor_square(pool, diff);
+    sc_tensor_t *loss = sc_tensor_mean(pool, sq);
+
+    sc_graph_t *g = sc_build_graph(meta, pgpu, gpc, gpg, loss, backend_mode);
+    if (!g) {
+        printf("FAILED to build graph\n");
+        return;
+    }
+
+    // Warmup
+    sc_graph_step(pool, pgpu, g, 0.01f);
+
+    double t0 = now_ms();
+    for (int i = 0; i < epochs; i++) {
+        sc_graph_step(pool, pgpu, g, 0.01f);
+        if ((i+1) % (epochs/5) == 0) {
+             printf("  Epoch %d/%d | Loss: %.6f\n", i+1, epochs, sc_graph_get_loss(g));
+        }
+    }
+    double elapsed = now_ms() - t0;
+
+    printf("  [RESULT] Total: %.2f ms | Avg: %.2f ms/step\n\n", elapsed, elapsed / epochs);
+
+    sc_graph_destroy(g);
+    sc_pool_destroy(pool); sc_pool_destroy(meta);
+    sc_pool_destroy(gpc); sc_pool_destroy(gpg); sc_pool_destroy(pgpu);
+}
+
+int main() {
+    printf("\n--- Deep MLP Benchmark (4-Layer, Hybrid Dispatch) ---\n\n");
+
+    // CPU Reference
+    run_mlp_bench(SC_BACKEND_CPU, "CPU-Only Backend (Baseline)");
+
+    //  GPU Reference
+    run_mlp_bench(SC_BACKEND_GPU, "GPU-Only Backend (Full Acceleration)");
+
+    //  Hybrid Dispatch (AOT-Profiled)
+    // This will use thresholds from CONFIG.soft
+    run_mlp_bench(SC_BACKEND_HYBRID, "HYBRID Backend (AOT-Optimized)");
+
+    return 0;
+}
diff --git a/benchmarks/bench_pytorch.py b/benchmarks/bench_pytorch.py
new file mode 100644
index 0000000..4d6310b
--- /dev/null
+++ b/benchmarks/bench_pytorch.py
@@ -0,0 +1,183 @@
+"""
+bench_pytorch.py
+================
+Benchmarks PyTorch (CPU) for the same operations as bench_softcuda.cpp so we
+can make an apples-to-apples CPU comparison.
+
+Operations benchmarked:
+  1. Element-wise ADD  (1M float32)
+  2. Matmul 512×512
+  3. XOR training (10 000 epochs, 2-layer MLP, SGD, CPU)
+
+Run:
+    cd /home/wslarch/Documents/projects/soft-cuda
+    source .venv/bin/activate
+    python3 benchmarks/bench_pytorch.py
+"""
+
+import time
+import torch
+import torch.nn as nn
+import torch.optim as optim
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"  Running on: {torch.cuda.get_device_name(0)}\n")
+
+REPS = 20
+
+def hline():
+    print("─" * 65)
+
+def result(label, ms, gflops=None):
+    if gflops:
+        print(f"  {label:<34}  {ms:8.2f} ms   {gflops:6.2f} GFLOPs")
+    else:
+        print(f"  {label:<34}  {ms:8.2f} ms")
+
+# Element-wise ADD
+hline()
+print("  Benchmark 1: Element-wise ADD  (1M float32)  [PyTorch CPU]")
+hline()
+
+N = 1024 * 1024
+a = torch.ones(N, device=device)
+b = torch.ones(N, device=device)
+
+times = []
+for _ in range(REPS):
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    c = a + b
+    torch.cuda.synchronize()
+    times.append((time.perf_counter() - t0) * 1e3)
+
+result("ADD [PyTorch CPU]", sum(times)/len(times), N * 1e-9)
+print()
+
+# Matmul 512×512
+hline()
+print("  Benchmark 2: Matmul 4096×4096  [PyTorch GPU]")
+hline()
+
+M, K, Np = 4096, 4096, 4096
+A = torch.randn(M, K, device=device)
+B = torch.randn(K, Np, device=device)
+total_flops = 2 * M * K * Np
+
+times = []
+for _ in range(REPS):
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    C = torch.mm(A, B)
+    torch.cuda.synchronize()
+    times.append((time.perf_counter() - t0) * 1e3)
+
+result("Matmul 512×512 [PyTorch OpenBLAS]",
+       sum(times)/len(times), total_flops * 1e-9)
+print()
+
+# XOR training
+hline()
+print("  Benchmark 3: XOR Training  (10 000 epochs, SGD lr=0.05, CPU)")
+hline()
+
+X = torch.tensor([[0,0],[0,1],[1,0],[1,1]], dtype=torch.float32)
+Y = torch.tensor([[0],[1],[1],[0]], dtype=torch.float32)
+
+class XORNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l1 = nn.Linear(2, 4)
+        self.l2 = nn.Linear(4, 1)
+    def forward(self, x):
+        return self.l2(torch.relu(self.l1(x)))
+
+X = X.to(device)
+Y = Y.to(device)
+model = XORNet().to(device)
+opt = optim.SGD(model.parameters(), lr=0.05)
+loss_fn = nn.MSELoss()
+
+EPOCHS = 10000
+t0 = time.perf_counter()
+for _ in range(EPOCHS):
+    opt.zero_grad()
+    pred = model(X)
+    loss = loss_fn(pred, Y)
+    loss.backward()
+    opt.step()
+elapsed = (time.perf_counter() - t0) * 1e3
+
+result(f"XOR training [{EPOCHS} epochs, PyTorch CPU]", elapsed)
+print(f"  Final loss: {loss.item():.6f}")
+print()
+
+# 100-Million Element Reduction 
+hline()
+print("  Benchmark 5: 100-Million Element Reduction (Mean)  [PyTorch CPU]")
+hline()
+
+N_billion = 100 * 1000 * 1000
+print("  [Allocating 400MB Host Memory...]")
+# Use torch.ones to mimic the C++ benchmark
+A_billion = torch.ones(N_billion, dtype=torch.float32, device=device)
+
+times = []
+for _ in range(5):
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    M = A_billion.mean()
+    torch.cuda.synchronize()
+    times.append((time.perf_counter() - t0) * 1e3)
+
+result("Mean 100M [PyTorch CPU]", sum(times)/len(times))
+print(f"  (Mean={M.item():.1f})")
+print()
+
+hline()
+print("  Benchmark 6: Deep MLP (4-Layer, 100 epochs, SGD)")
+hline()
+
+class DeepMLP(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(128, 256), nn.ReLU(),
+            nn.Linear(256, 256), nn.ReLU(),
+            nn.Linear(256, 128), nn.ReLU(),
+            nn.Linear(128, 1)
+        )
+    def forward(self, x):
+        return self.net(x)
+
+Xm = torch.randn(1024, 128, device=device)
+Ym = torch.randn(1024, 1, device=device)
+mlp = DeepMLP().to(device)
+opt_mlp = optim.SGD(mlp.parameters(), lr=0.01)
+loss_fn2 = nn.MSELoss()
+
+# warmup
+for _ in range(5):
+    opt_mlp.zero_grad()
+    loss_fn2(mlp(Xm), Ym).backward()
+    opt_mlp.step()
+
+torch.cuda.synchronize()
+t0 = time.perf_counter()
+for epoch in range(1, 101):
+    opt_mlp.zero_grad()
+    pred = mlp(Xm)
+    loss = loss_fn2(pred, Ym)
+    loss.backward()
+    opt_mlp.step()
+    if epoch % 20 == 0:
+        torch.cuda.synchronize()
+        print(f"  Epoch {epoch}/100 | Loss: {loss.item():.6f}")
+torch.cuda.synchronize()
+elapsed = (time.perf_counter() - t0) * 1e3
+result("Deep MLP [PyTorch GPU]", elapsed)
+result("Per epoch", elapsed / 100)
+print()
+
+hline()
+print("  For GPU comparison run bench_softcuda with SC_BACKEND_GPU.")
+hline()
diff --git a/benchmarks/bench_softcuda.cpp b/benchmarks/bench_softcuda.cpp
new file mode 100644
index 0000000..e9d27bb
--- /dev/null
+++ b/benchmarks/bench_softcuda.cpp
@@ -0,0 +1,388 @@
+/* Build:  (benchmarks/CMakeLists.txt)
+ * Run:    ./bench_softcuda
+ */
+
+#include "soft-cuda/tensor/api.h"
+#include "soft-cuda/python/soft_cuda_python.h"
+#include <cublas_v2.h>
+
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <cstring>
+
+static double now_ms() {
+    using namespace std::chrono;
+    return (double)duration_cast<nanoseconds>(
+               high_resolution_clock::now().time_since_epoch())
+               .count() * 1e-6;
+}
+
+static void hline() { printf("─────────────────────────────────────────────────────────────────\n"); }
+static void header(const char *s) {
+    hline();
+    printf("  %s\n", s);
+    hline();
+}
+static void result(const char *label, double ms, double ops_billions) {
+    if (ops_billions > 0)
+        printf("  %-30s  %8.2f ms   %6.2f GFLOPs\n", label, ms, ops_billions / ms * 1e3);
+    else
+        printf("  %-30s  %8.2f ms\n", label, ms);
+}
+
+static void bench_add() {
+    header("Benchmark 1: Element-wise ADD  (1M float32)");
+
+    const uint32_t N = 1024 * 1024;
+    float *data_a = new float[N];
+    float *data_b = new float[N];
+    for (uint32_t i = 0; i < N; i++) { data_a[i] = (float)i * 0.001f; data_b[i] = 1.0f; }
+
+    uint32_t dims[] = {N};
+    const int REPS = 20;
+
+    {
+        sc_pool_t *pool = sc_pool_create((size_t)N * 4 * 10, 0);
+        sc_pool_t *meta = sc_pool_create(2*1024*1024, 0);
+        sc_pool_t *gpc  = sc_pool_create((size_t)N * 4 * 2, 0);
+        sc_pool_t *gpg  = sc_pool_create(2*1024*1024, 0);  /* dummy */
+        sc_pool_t *pgpu = sc_pool_create(2*1024*1024, 0);  /* dummy CPU pool */
+
+        sc_tensor_t *a = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 1, dims, data_a, 0);
+        sc_tensor_t *b = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 1, dims, data_b, 0);
+        sc_tensor_t *c = sc_tensor_add(pool, a, b);
+        sc_graph_t *g  = sc_build_graph(meta, pgpu, gpc, gpg, c, SC_BACKEND_CPU);
+
+        double total = 0;
+        for (int r = 0; r < REPS; r++) {
+            double t0 = now_ms();
+            sc_graph_forward(pool, pgpu, g);
+            total += now_ms() - t0;
+        }
+        sc_graph_destroy(g);
+
+        result("ADD [CPU]", total / REPS, (double)N * 1e-9);
+        sc_pool_destroy(pool); sc_pool_destroy(meta);
+        sc_pool_destroy(gpc); sc_pool_destroy(gpg); sc_pool_destroy(pgpu);
+    }
+
+    {
+        sc_pool_t *pool = sc_pool_create((size_t)N * 4 * 10, 0);
+        sc_pool_t *meta = sc_pool_create(2*1024*1024, 0);
+        sc_pool_t *gpc  = sc_pool_create((size_t)N * 4 * 2, 0);
+        sc_pool_t *gpg  = sc_pool_create((size_t)N * 4 * 2, 1);
+        sc_pool_t *pgpu = sc_pool_create((size_t)N * 4 * 10, 1);
+
+        sc_tensor_t *a = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 1, dims, data_a, 0);
+        sc_tensor_t *b = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 1, dims, data_b, 0);
+        sc_tensor_t *c = sc_tensor_add(pool, a, b);
+        sc_graph_t *g  = sc_build_graph(meta, pgpu, gpc, gpg, c, SC_BACKEND_GPU);
+
+        double total = 0;
+        for (int r = 0; r < REPS; r++) {
+            double t0 = now_ms();
+            sc_graph_forward(pool, pgpu, g);
+            total += now_ms() - t0;
+        }
+        sc_graph_destroy(g);
+        result("ADD [GPU]", total / REPS, (double)N * 1e-9);
+        sc_pool_destroy(pool); sc_pool_destroy(meta);
+        sc_pool_destroy(gpc); sc_pool_destroy(gpg); sc_pool_destroy(pgpu);
+    }
+
+    delete[] data_a; delete[] data_b;
+    printf("\n");
+}
+
+static void bench_matmul() {
+    header("Benchmark 2: Matmul 4096×4096");
+
+    const uint32_t M = 4096, K = 4096, N = 4096;
+    const uint32_t total_flops = 2 * M * K * N;  
+    float *A = new float[M * K];
+    float *B = new float[K * N];
+    for (uint32_t i = 0; i < M*K; i++) A[i] = (float)i * 0.001f;
+    for (uint32_t i = 0; i < K*N; i++) B[i] = (float)i * 0.001f;
+
+    uint32_t dA[] = {M, K}, dB[] = {K, N};
+    const int REPS = 10;
+
+    auto run = [&](int backend_flag, const char *label) {
+        bool gpu = (backend_flag == SC_BACKEND_GPU);
+        size_t pool_sz = (size_t)(M*K + K*N + M*N) * 4 * 4;
+        size_t vram_sz = gpu ? pool_sz : 1024;
+
+        sc_pool_t *pool = sc_pool_create(pool_sz, 0);
+        sc_pool_t *meta = sc_pool_create(2*1024*1024, 0);
+        sc_pool_t *gpc  = sc_pool_create(2*1024*1024, 0);
+        sc_pool_t *gpg  = sc_pool_create(2*1024*1024, gpu ? 1 : 0);
+        sc_pool_t *pgpu = sc_pool_create(vram_sz + 1024*1024, gpu ? 1 : 0);
+
+        sc_tensor_t *a = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dA, A, 0);
+        sc_tensor_t *b = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dB, B, 0);
+        sc_tensor_t *c = sc_tensor_mul_naive(pool, a, b);
+        sc_graph_t *g  = sc_build_graph(meta, pgpu, gpc, gpg, c, backend_flag);
+
+        double total = 0;
+        for (int r = 0; r < REPS; r++) {
+            double t0 = now_ms();
+            sc_graph_forward(pool, pgpu, g);
+            total += now_ms() - t0;
+        }
+        sc_graph_destroy(g);
+        result(label, total / REPS, (double)total_flops * 1e-9);
+        sc_pool_destroy(pool); sc_pool_destroy(meta);
+        sc_pool_destroy(gpc); sc_pool_destroy(gpg); sc_pool_destroy(pgpu);
+    };
+    auto run_cublas = [&]() {
+        float *d_a, *d_b, *d_c;
+        cudaMalloc((void**)&d_a, M*K*sizeof(float));
+        cudaMalloc((void**)&d_b, K*N*sizeof(float));
+        cudaMalloc((void**)&d_c, M*N*sizeof(float));
+        cudaMemcpy(d_a, A, M*K*sizeof(float), cudaMemcpyHostToDevice);
+        cudaMemcpy(d_b, B, K*N*sizeof(float), cudaMemcpyHostToDevice);
+
+        cublasHandle_t handle;
+        cublasCreate(&handle);
+        float alpha = 1.0f, beta = 0.0f;
+
+        // warmup
+        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_b, N, d_a, K, &beta, d_c, N);
+        cudaDeviceSynchronize();
+
+        double total = 0;
+        for (int r = 0; r < REPS; r++) {
+            double t0 = now_ms();
+            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_b, N, d_a, K, &beta, d_c, N);
+            cudaDeviceSynchronize();
+            total += now_ms() - t0;
+        }
+
+        double ms = total / REPS;
+        double gflops = (double)total_flops / ms * 1e-6;
+        printf("  %-34s  %8.2f ms   %6.2f GFLOPs\n", "Matmul 4096x4096 [cuBLAS]", ms, gflops);
+
+        cublasDestroy(handle);
+        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
+    };
+
+    // run(SC_BACKEND_CPU, "Matmul 512x512 [CPU naive]");
+    run(SC_BACKEND_GPU, "Matmul 4096x4096 [GPU sgemm]");
+    printf("-----------------------------------------RUNING CUBLAS_OP_N--------------------------------------\n");
+    run_cublas();
+
+    delete[] A; delete[] B;
+    printf("\n");
+}
+
+
+
+
+static void bench_xor_training(int backend_flag, const char *label) {
+    float X_data[] = {0,0, 0,1, 1,0, 1,1};
+    float Y_data[] = {0, 1, 1, 0};
+    uint32_t dX[]  = {4, 2}, dY[] = {4, 1};
+    uint32_t dW1[] = {2, 4}, db1[] = {1, 4};
+    uint32_t dW2[] = {4, 1}, db2[] = {1, 1};
+    const int EPOCHS = 10000;
+
+    bool gpu = (backend_flag != SC_BACKEND_CPU);
+    sc_pool_t *pool  = sc_pool_create(4*1024*1024, 0);
+    sc_pool_t *meta  = sc_pool_create(2*1024*1024, 0);
+    sc_pool_t *gpc   = sc_pool_create(1024*1024, 0);
+    sc_pool_t *gpg   = sc_pool_create(gpu ? 1024*1024 : 1024, gpu ? 1 : 0);
+    sc_pool_t *pgpu  = sc_pool_create(gpu ? 4*1024*1024 : 1024, gpu ? 1 : 0);
+
+    sc_tensor_t *X  = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dX, X_data, 0);
+    assert(pool != NULL && meta != NULL && gpc != NULL && gpg != NULL && pgpu != NULL);
+    sc_tensor_t *Y  = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dY, Y_data, 0);
+    sc_tensor_t *W1 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dW1, NULL, 1);
+    sc_tensor_t *b1 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, db1, NULL, 1);
+    sc_tensor_t *W2 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dW2, NULL, 1);
+    sc_tensor_t *b2 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, db2, NULL, 1);
+    sc_tensor_fill_random_normal(W1, 0.0f, 0.3f);
+    sc_tensor_fill_random_normal(W2, 0.0f, 0.3f);
+
+    sc_tensor_t *H1   = sc_tensor_mul_naive(pool, X, W1);
+    sc_tensor_t *H1b  = sc_tensor_add(pool, H1, b1);
+    sc_tensor_t *H    = sc_tensor_relu(pool, H1b);
+    sc_tensor_t *O    = sc_tensor_mul_naive(pool, H, W2);
+    sc_tensor_t *Yp   = sc_tensor_add(pool, O, b2);
+    sc_tensor_t *diff = sc_tensor_sub(pool, Yp, Y);
+    sc_tensor_t *sq   = sc_tensor_square(pool, diff);
+    sc_tensor_t *loss = sc_tensor_mean(pool, sq);
+
+    sc_graph_t *g = sc_build_graph(meta, pgpu, gpc, gpg, loss, backend_flag);
+    assert(g != NULL);
+
+    double t0 = now_ms();
+    for (int e = 0; e < EPOCHS; e++)
+        sc_graph_step(pool, pgpu, g, 0.05f);
+    double elapsed = now_ms() - t0;
+
+    float final_loss = sc_graph_get_loss(g);
+    printf("  %-32s  %8.2f ms   loss=%.6f  (%d epochs)\n",
+           label, elapsed, final_loss, EPOCHS);
+
+    sc_graph_destroy(g);
+    sc_pool_destroy(pool); sc_pool_destroy(meta);
+    sc_pool_destroy(gpc); sc_pool_destroy(gpg); sc_pool_destroy(pgpu);
+}
+
+static void bench_before_after_profiling() {
+    header("Benchmark 4: Before/After Profiling Benefit");
+
+    printf("  [BEFORE profiling -> uses hardcoded default thresholds]\n");
+    {
+        system("mv ~/.config/soft-cuda/CONFIG.soft "
+               "~/.config/soft-cuda/CONFIG.soft.bak 2>/dev/null || true");
+        bench_xor_training(SC_BACKEND_HYBRID, "XOR HYBRID [before profiling]");
+        system("mv ~/.config/soft-cuda/CONFIG.soft.bak "
+               "~/.config/soft-cuda/CONFIG.soft 2>/dev/null || true");
+    }
+
+    printf("\n  [AFTER profiling → real measured thresholds]\n");
+    bench_xor_training(SC_BACKEND_HYBRID, "XOR HYBRID [after profiling]");
+    printf("\n");
+}
+
+static void bench_hybrid_network() {
+    header("Benchmark 6: Complex Hybrid Network (Forward Pass)");
+
+    const int B = 128; // Batch size
+    const int D_IN = 256, D_HID = 256, D_SMALL = 16, D_OUT = 1;
+
+    uint32_t dX[] = {B, D_IN};
+    uint32_t dW1[] = {D_IN, D_HID}, db1[] = {1, D_HID};
+    uint32_t dW2[] = {D_HID, D_SMALL}, db2[] = {1, D_SMALL};
+    uint32_t dW3[] = {D_SMALL, D_OUT}, db3[] = {1, D_OUT};
+
+    sc_pool_t *pool = sc_pool_create(16*1024*1024, 0);
+    sc_pool_t *meta = sc_pool_create(4*1024*1024, 0);
+    sc_pool_t *gpc  = sc_pool_create(4*1024*1024, 0);
+    sc_pool_t *gpg  = sc_pool_create(4*1024*1024, 1);
+    sc_pool_t *pgpu = sc_pool_create(16*1024*1024, 1);
+
+    sc_tensor_t *X  = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dX, NULL, 0);
+    sc_tensor_t *W1 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dW1, NULL, 0);
+    sc_tensor_t *b1 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, db1, NULL, 0);
+    sc_tensor_t *W2 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dW2, NULL, 0);
+    sc_tensor_t *b2 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, db2, NULL, 0);
+    sc_tensor_t *W3 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dW3, NULL, 0);
+    sc_tensor_t *b3 = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, db3, NULL, 0);
+
+    // Layer 1: GPU (32k elements)
+    sc_tensor_t *L1_mat = sc_tensor_mul_naive(pool, X, W1);
+    sc_tensor_t *L1_add = sc_tensor_add(pool, L1_mat, b1);
+    sc_tensor_t *L1_out = sc_tensor_relu(pool, L1_add);
+
+    // Layer 2: Mixed (2k elements -> Matmul on GPU, Add/ReLU on CPU)
+    sc_tensor_t *L2_mat = sc_tensor_mul_naive(pool, L1_out, W2);
+    sc_tensor_t *L2_add = sc_tensor_add(pool, L2_mat, b2);
+    sc_tensor_t *L2_out = sc_tensor_relu(pool, L2_add);
+
+    // Layer 3: CPU (128 elements)
+    sc_tensor_t *L3_mat = sc_tensor_mul_naive(pool, L2_out, W3);
+    sc_tensor_t *Yp     = sc_tensor_add(pool, L3_mat, b3);
+
+    sc_graph_t *g = sc_build_graph(meta, pgpu, gpc, gpg, Yp, SC_BACKEND_HYBRID);
+    
+    double t0 = now_ms();
+    sc_graph_forward(pool, pgpu, g);
+    double elapsed = now_ms() - t0;
+
+    result("Hybrid Complex Network", elapsed, 0);
+
+    sc_graph_destroy(g);
+    sc_pool_destroy(pool); sc_pool_destroy(meta);
+    sc_pool_destroy(gpc); sc_pool_destroy(gpg); sc_pool_destroy(pgpu);
+    printf("\n");
+}
+
+// Bench: Large pipeline (add+relu+mean on 4M elements)=================
+static void bench_large_pipeline() {
+    header("Benchmark 3: XOR Training  (10000 epochs)");
+    bench_xor_training(SC_BACKEND_CPU, "XOR [CPU backend]");
+    bench_xor_training(SC_BACKEND_GPU, "XOR [GPU backend]");
+    printf("\n");
+}
+
+
+// BIG test
+static void bench_billion() {
+    header("Benchmark 5: 100-Million Element Reduction (Mean)");
+
+    // 100 Million elements = 400 MB of RAM/VRAM
+    const uint32_t N = 100 * 1000 * 1000;
+    uint32_t d[] = {N};
+    
+    printf("  [Allocating 400MB Host Memory...]\n");
+    fflush(stdout);
+    float *data = new float[N];
+    for (uint32_t i = 0; i < N; i++) data[i] = 1.0f; // Expected mean is 1.0
+
+    auto run = [&](int backend_flag, const char *label) {
+        bool gpu = (backend_flag == SC_BACKEND_GPU);
+        size_t vram_sz = gpu ? (size_t)N * 4 + 4*1024*1024 : 1024;
+        
+        sc_pool_t *pool = sc_pool_create((size_t)N * 4 + 4*1024*1024, 0);
+        sc_pool_t *meta = sc_pool_create(4*1024*1024, 0);
+        sc_pool_t *gpc  = sc_pool_create(4*1024*1024, 0);
+        sc_pool_t *gpg  = sc_pool_create(gpu ? 4*1024*1024 : 1024, gpu ? 1 : 0);
+        sc_pool_t *pgpu = sc_pool_create(vram_sz, gpu ? 1 : 0);
+
+        if (!pool || !meta || !gpc || !gpg || !pgpu) {
+            printf("  [ERROR] Memory allocation failed! pool=%p, meta=%p, pgpu=%p (vram_sz=%zu)\n", pool, meta, pgpu, vram_sz);
+            if (pool) sc_pool_destroy(pool);
+            if (meta) sc_pool_destroy(meta);
+            if (gpc) sc_pool_destroy(gpc);
+            if (gpg) sc_pool_destroy(gpg);
+            if (pgpu) sc_pool_destroy(pgpu);
+            return;
+        }
+
+        sc_tensor_t *A = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 1, d, data, 0);
+        sc_tensor_t *M = sc_tensor_mean(pool, A);
+
+        sc_graph_t *g = sc_build_graph(meta, pgpu, gpc, gpg, M, backend_flag);
+        assert(g != NULL && "Graph build failed");
+
+        double t0 = now_ms();
+        sc_graph_forward(pool, pgpu, g);
+        double elapsed = now_ms() - t0;
+        
+        float result_val = sc_graph_get_loss(g);
+        
+        printf("  %-32s  %8.2f ms   (Mean=%.1f)\n", label, elapsed, result_val);
+        
+        sc_graph_destroy(g);
+        sc_pool_destroy(pool); sc_pool_destroy(meta);
+        sc_pool_destroy(gpc); sc_pool_destroy(gpg); sc_pool_destroy(pgpu);
+    };
+
+    run(SC_BACKEND_CPU, "Mean 1B [CPU]");
+    run(SC_BACKEND_GPU, "Mean 1B [GPU]");
+
+    delete[] data;
+    printf("\n");
+}
+
+int main(void) {
+    printf("\n");
+    hline();
+    printf("||           soft-cuda  vs  reference  —  Benchmark Suite         ||\n");
+    printf("||     (Run bench_pytorch.py for PyTorch comparison numbers)      ||\n");
+    hline();
+    bench_add();
+    bench_matmul();
+    bench_large_pipeline();
+    bench_before_after_profiling();
+    bench_hybrid_network();
+    bench_billion();
+
+    hline();
+    printf("  Done.\n");
+    hline();
+    return 0;
+}
diff --git a/benchmarks/run_all.sh b/benchmarks/run_all.sh
new file mode 100755
index 0000000..123c21c
--- /dev/null
+++ b/benchmarks/run_all.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# run_all.sh — Full benchmark pipeline
+#
+# Usage:
+#   cd /home/wslarch/Documents/projects/soft-cuda
+#   bash benchmarks/run_all.sh
+#
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BUILD_DIR="$SCRIPT_DIR/../build_wsl"
+
+echo ""
+echo "          soft-cuda  Complete Benchmark Pipeline                  "
+echo ""
+
+echo ">>> Building (Debug)..."
+mkdir -p "$BUILD_DIR"
+cmake -S "$SCRIPT_DIR/.." -B "$BUILD_DIR" \
+      -DCMAKE_BUILD_TYPE=Debug \
+      -DCMAKE_CUDA_COMPILER=/opt/cuda/bin/nvcc
+cmake --build "$BUILD_DIR" -j$(nproc 2>/dev/null || echo 4)
+echo ""
+
+echo ">>> Running AOT profiler..."
+"$BUILD_DIR/soft_profiler"
+echo ""
+
+echo ">>> Running soft-cuda benchmarks..."
+"$BUILD_DIR/benchmarks/bench_softcuda"
+echo ""
+
+echo ">>> Running Deep MLP Hybrid benchmark..."
+"$BUILD_DIR/benchmarks/bench_deep_mlp"
+echo ""
+
+echo ">>> Running PyTorch benchmark..."
+VENV="$SCRIPT_DIR/../.venv"
+if [ -f "$VENV/bin/activate" ]; then
+    source "$VENV/bin/activate"
+fi
+python3 "$SCRIPT_DIR/bench_pytorch.py"
+echo ""
+
+echo ">>> All benchmarks complete."
diff --git a/include/soft-cuda/profiler/profiler_core.h b/include/soft-cuda/profiler/profiler_core.h
new file mode 100644
index 0000000..04697f7
--- /dev/null
+++ b/include/soft-cuda/profiler/profiler_core.h
@@ -0,0 +1,4 @@
+#pragma once
+
+int soft_profile_and_write(const char *out_path);
+
diff --git a/main.cpp b/main.cpp
index 1620acb..47ce62a 100644
--- a/main.cpp
+++ b/main.cpp
@@ -219,7 +219,7 @@ int main(void) {
         pool_grad_cpu,
         pool_grad_gpu,
         loss,
-        SC_BACKEND_CPU          /* use SC_BACKEND_GPU or SC_BACKEND_HYBRID if CUDA is available */
+        SC_BACKEND_GPU          /* use SC_BACKEND_GPU or SC_BACKEND_HYBRID if CUDA is available */
     );
     assert(graph != NULL);
 
@@ -271,6 +271,8 @@ int main(void) {
     banner("7. Final predictions");
 
     sc_graph_forward(pool, pool_gpu, graph);
+    // Was cause of error
+    sc_autograd_gpu_transfer(graph);  
 
     float *inputs      = (float *)sc_tensor_get_data(X);
     float *targets     = (float *)sc_tensor_get_data(Y);
@@ -314,6 +316,7 @@ int main(void) {
 
     /* Forward pass after reloading to confirm predictions match */
     sc_graph_forward(pool, pool_gpu, graph);
+    sc_autograd_gpu_transfer(graph);
 
     printf("  Post-reload loss : %.8f\n", sc_graph_get_loss(graph));
 
diff --git a/make.sh b/make.sh
index 27ac66a..39822e7 100755
--- a/make.sh
+++ b/make.sh
@@ -1,2 +1,47 @@
-time cmake -B build -DCMAKE_BUILD_TYPE=Debug
-time cmake --build build -j16
+#!/bin/bash
+
+current_dir=$(pwd)
+
+usage() {
+  echo -e "Usage: $(basename "$0") [flags]\n"
+  echo "  -b   Build (debug)"
+  echo "  -z   Clean build (remove build/ first)"
+  echo "  -x   Clean build (interactive, prompts before deleting)"
+  echo "  -r   Run (stderr suppressed)"
+  echo "  -v   Run (verbose, with timing)"
+  echo "  -t   Run tests (ctest)"
+  echo "  -m   Run benchmarks"
+  echo "  -h   Show this help"
+}
+
+build() {
+  time cmake -B build -DCMAKE_BUILD_TYPE=Debug
+  time cmake --build build -j$(nproc)
+  builtin  echo "===== BUILD COMPLETE ====="
+}
+
+while getopts "brhvtzxm" opt; do
+  case "$opt" in
+    h) usage ;;
+    b) build ;;
+    z|x)
+      if [[ -d "build" && $opt == "x" ]]; then
+        rm -rfi "${current_dir}/build/"
+      else
+        rm -rf "${current_dir}/build/"
+      fi
+      build
+      ;;
+    v) echo "===== RUNNING ====="
+       time "${current_dir}/build/soft" ;;
+    r) time "${current_dir}/build/soft" 2>/dev/null ;;
+    t) cd build && ctest ;;
+    m) echo "===== BENCHMARKS ====="
+       "${current_dir}/benchmarks/run_all.sh" 2>/dev/null ;;
+   \?) usage; exit 1 ;;
+  esac
+done
+
+if [[ $OPTIND -eq 1 ]]; then
+  usage
+fi
diff --git a/src/backend_cpu/backprop/backprop_b.cpp b/src/backend_cpu/backprop/backprop_b.cpp
index c8f2712..bef9b52 100644
--- a/src/backend_cpu/backprop/backprop_b.cpp
+++ b/src/backend_cpu/backprop/backprop_b.cpp
@@ -1,6 +1,4 @@
 #include "internal_header.h"
-
-
 // We are gonna write a backprop fucntion for matmul cause that is easy thing to do
 // Since our backprop is an eagar evaluation we will make tensor objects inside the function
 // transpose it execute it as well and then work with it
@@ -9,40 +7,93 @@
 // Since we are going to get execution node we could have a decision made here if it's data will be
 // written on device_ptr of node_.grad->data
 // We are gonna do this cause making decsion on higher level will require us to make two kernels for similar work.
+/// @brief Initializes the gradient buffers for a list of execution nodes.
+/// @param nodes A vector of pointers to execution nodes.
 void gradInitializer(std::vector<execution_node_t *> &nodes) {
     for (auto &node : nodes) {
         if (!node->t->grad_compute) continue;
-        memset(node->t->grad->data, 0,node->t->grad->nvalues * sizeof(float));
-        // TODO: We are not zeroing the GPU will have to do that
+        // Zero CPU grad buffer
+        memset(node->t->grad->data, 0, node->t->grad->nvalues * sizeof(float));
+        // Zero GPU grad buffer if it exists
+        if (node->device_ptr_grad != NULL) {
+            soft_cuda_memset_zero(node->device_ptr_grad, node->t->nvalues * sizeof(float));
+        }
     }
+    // Seed the root node's gradient = 1 (dL/dL = 1)
     if (!nodes.empty()) {
         execution_node_t *root = nodes.back();
-        if (root->t != nullptr && root->t->grad != nullptr && root->t->grad->data != nullptr) {
-            ((float*)root->t->grad->data)[0] = 1.0f;
+        if (root->t != nullptr && root->t->grad != nullptr &&
+            root->t->grad->data != nullptr) {
+            ((float *)root->t->grad->data)[0] = 1.0f;
+            // If root is on GPU, seed its GPU grad buffer too
+            if (root->device_ptr_grad != NULL) {
+                float one = 1.0f;
+                soft_cuda_memcpy_h2d(root->device_ptr_grad, &one, sizeof(float));
+            }
         }
     }
 }
-// Since this will be eagar evalution we are gonna execute it here now and we are gonna return a
-// boolean.
-// @return: boolean
-// @params: execution_node_t
-// Since all the memory is already executed we just need the execution node
 
+/// @brief Executes the backward pass (backpropagation) across a list of execution nodes.
+/// @param nodes A vector of pointers to execution nodes in forward execution order.
+/// @return true if backpropagation was successful for all nodes, false otherwise.
 bool backprop__(std::vector<execution_node_t *> &nodes) {
     for (int i = (int)nodes.size() - 1; i >= 0; i--) {
         execution_node_t *node = nodes[(size_t)i];
-        // Cause it's not needed
-        if (!node->t->grad_compute)
-            continue;
-        // Cause it's super child
-        if (node->t->op == tensor_op_t::NONE)
-            continue;
-        bool success{false};
+        if (!node->t->grad_compute) continue;
+        if (node->t->op == tensor_op_t::NONE) continue;
+
+        bool success = false;
+
         if (node->device_ptr_grad != NULL) {
-            success = backprop_gpu(node);
+            // Resolve parent execution nodes for the GPU backward kernels
+            execution_node_t *parent_a = nullptr;
+            execution_node_t *parent_b = nullptr;
+            int32_t a_idx = getTheExecutionNodeIndex(node, 0);
+            int32_t b_idx = getTheExecutionNodeIndex(node, 1);
+            if (a_idx >= 0) parent_a = nodes[(size_t)a_idx];
+            if (b_idx >= 0) parent_b = nodes[(size_t)b_idx];
+
+            success = backprop_gpu_dispatch(node, parent_a, parent_b);
+            if (!success) {
+                // Graceful fallback: copy grad from GPU, run CPU backward
+                debug("backprop__: GPU backward unsupported for op=%d, falling back to CPU\n",
+                      (int)node->t->op);
+                // Pull this node's grad from GPU to CPU
+                if (node->device_ptr_grad != NULL && node->t->grad != NULL) {
+                    soft_cuda_memcpy_d2h(node->t->grad->data, node->device_ptr_grad,
+                                         node->t->grad->nvalues * sizeof(float));
+                }
+                // Pull parent forward data from GPU if needed
+                if (parent_a && parent_a->device_ptr != NULL &&
+                    parent_a->t->device == device_type::GPU) {
+                    soft_cuda_memcpy_d2h(parent_a->t->data, parent_a->device_ptr,
+                                         parent_a->t->nvalues * sizeof(float));
+                }
+                if (parent_b && parent_b->device_ptr != NULL &&
+                    parent_b->t->device == device_type::GPU) {
+                    soft_cuda_memcpy_d2h(parent_b->t->data, parent_b->device_ptr,
+                                         parent_b->t->nvalues * sizeof(float));
+                }
+                success = backprop_cpu(node);
+                // Push parent grads back to GPU so SGD sees them
+                if (parent_a && parent_a->device_ptr_grad != NULL &&
+                    parent_a->t->grad != NULL) {
+                    soft_cuda_memcpy_h2d(parent_a->device_ptr_grad,
+                                         parent_a->t->grad->data,
+                                         parent_a->t->grad->nvalues * sizeof(float));
+                }
+                if (parent_b && parent_b->device_ptr_grad != NULL &&
+                    parent_b->t->grad != NULL) {
+                    soft_cuda_memcpy_h2d(parent_b->device_ptr_grad,
+                                         parent_b->t->grad->data,
+                                         parent_b->t->grad->nvalues * sizeof(float));
+                }
+            }
         } else {
             success = backprop_cpu(node);
         }
+
         if (!success) {
             debug("backprop__: FUBAR at node pos=%d\n", (int)node->pos);
             return false;
@@ -55,6 +106,9 @@ bool backprop__(std::vector<execution_node_t *> &nodes) {
 // CPU BACKWARD DISPATCHER
 // ================================================================================ 
 
+/// @brief CPU backward dispatcher. Dispatches to the appropriate CPU gradient operation based on the node's tensor operation.
+/// @param node Pointer to the execution node.
+/// @return true if the backward operation was successful, false otherwise.
 bool backprop_cpu(execution_node_t *node) {
     assert(node != NULL);
     assert(node->t != NULL);
@@ -69,12 +123,12 @@ bool backprop_cpu(execution_node_t *node) {
         // TODO: Implement here
         success = false;
         break;
-    // case tensor_op_t::MUL_MATRIX:
-    //     assert(node->t->a != NULL);
-    //     assert(node->t->b != NULL);
-    //     assert(node->t->a->dtype == node->t->b->dtype);
-    //     success = tensor_mul_grad_op_matrix(node);
-    //     break;
+    case tensor_op_t::MUL_MATRIX:
+        assert(node->t->a != NULL);
+        assert(node->t->b != NULL);
+        assert(node->t->a->dtype == node->t->b->dtype);
+        success = tensor_mul_grad_op_matrix(node);
+        break;
     case tensor_op_t::MUL_SCALAR:
         assert(node->t->a != NULL);
         assert(node->t->b != NULL);
@@ -132,17 +186,19 @@ bool backprop_cpu(execution_node_t *node) {
     return success;
 }
 
-// =============================================================================
-// GPU BACKWARD DISPATCHER (STUB)
-// =============================================================================
-bool backprop_gpu([[maybe_unused]] execution_node_t *node) {
-    assert(node != NULL);
-    assert(node->t != NULL);
-    // TODO: Implement GPU backward kernels
-    debug("backprop_gpu: not yet implemented, falling back\n");
-    return false;
-}
+// backprop_gpu() is no longer used directly.
+// The full GPU backward implementation lives in
+// src/backend_gpu/backprop/backprop_gpu.cu :: backprop_gpu_dispatch().
+// backprop__() above calls backprop_gpu_dispatch(node, parent_a, parent_b).
+//
+// This stub is kept so any legacy call sites get a linker error rather than
+// silent incorrect behavior.
+// (removed stub)
 // TODO: ONE PROBLEM IS WE NEED MULTIPLE CASE FOR 
+
+/// @brief Computes the gradient for element-wise addition operation.
+/// @param node Pointer to the execution node.
+/// @return true if successful.
 bool tensor_grad_op_add(execution_node_t *node) {
     tensor_t *out_grad = node->t->grad;
     tensor_t *a = node->t->a;
@@ -161,6 +217,9 @@ bool tensor_grad_op_add(execution_node_t *node) {
     return true;
 }
 
+/// @brief Computes the gradient for element-wise broadcasting addition operation.
+/// @param node Pointer to the execution node.
+/// @return true if successful.
 bool tensor_grad_op_broadcasting_add(execution_node_t *node) {
     tensor_t *out_grad = node->t->grad;
     tensor_t *a = node->t->a;
@@ -193,6 +252,9 @@ bool tensor_grad_op_broadcasting_add(execution_node_t *node) {
     return true;
 }
 
+/// @brief Computes the gradient for element-wise subtraction operation.
+/// @param node Pointer to the execution node.
+/// @return true if successful.
 bool tensor_grad_op_sub(execution_node_t *node) {
     tensor_t *out_grad = node->t->grad;
     tensor_t *a        = node->t->a;
@@ -213,6 +275,9 @@ bool tensor_grad_op_sub(execution_node_t *node) {
 
 
 
+/// @brief Computes the gradient for the ReLU activation function.
+/// @param node Pointer to the execution node.
+/// @return true if successful.
 bool tensor_grad_op_relu(execution_node_t *node) {
     tensor_t *out_grad = node->t->grad;
     tensor_t *a        = node->t->a;
@@ -233,6 +298,9 @@ bool tensor_grad_op_relu(execution_node_t *node) {
     return true;
 }
 
+/// @brief Computes the gradient for the mean operation.
+/// @param node Pointer to the execution node.
+/// @return true if successful.
 bool tensor_grad_op_mean(execution_node_t *node) {
     tensor_t *out_grad = node->t->grad;
     tensor_t *a        = node->t->a;
@@ -252,6 +320,9 @@ bool tensor_grad_op_mean(execution_node_t *node) {
     return true;
 }
 
+/// @brief Computes the gradient for scalar multiplication operation.
+/// @param node Pointer to the execution node.
+/// @return true if successful.
 bool tensor_mul_grad_op_scalar(execution_node_t *node) {
     tensor_t *out_grad = node->t->grad;
     tensor_t *a        = node->t->a;
@@ -272,6 +343,9 @@ bool tensor_mul_grad_op_scalar(execution_node_t *node) {
 }
 
 
+/// @brief Computes the gradient for matrix transpose operation.
+/// @param node Pointer to the execution node.
+/// @return true if successful.
 bool tensor_tranpose_grad_op_matrix(execution_node_t *node) {
     tensor_t *out_grad = node->t->grad;
     tensor_t *a        = node->t->a;
@@ -292,6 +366,9 @@ bool tensor_tranpose_grad_op_matrix(execution_node_t *node) {
     return true;
 }
 
+/// @brief Computes the gradient for naive matrix multiplication operation.
+/// @param node Pointer to the execution node.
+/// @return true if successful.
 bool tensor_mul_grad_op_matrix_naive(execution_node_t *node) {
     tensor_t *out_grad = node->t->grad;
     tensor_t *a        = node->t->a;
@@ -336,6 +413,9 @@ bool tensor_mul_grad_op_matrix_naive(execution_node_t *node) {
     return true;
 }
 
+/// @brief Computes the gradient for element-wise square operation.
+/// @param node Pointer to the execution node.
+/// @return true if successful.
 bool tensor_grad_op_square(execution_node_t *node) {
     tensor_t *out_grad = node->t->grad;
     tensor_t *a        = node->t->a;
diff --git a/src/backend_cpu/backprop/backprop_cuda_bridge.cu b/src/backend_cpu/backprop/backprop_cuda_bridge.cu
new file mode 100644
index 0000000..11032e4
--- /dev/null
+++ b/src/backend_cpu/backprop/backprop_cuda_bridge.cu
@@ -0,0 +1,15 @@
+#include "internal_header.h"
+#include <cuda_runtime.h>
+#include <cassert>
+
+void soft_cuda_memset_zero(void *ptr, size_t bytes) {
+    cudaMemset(ptr, 0, bytes);
+}
+
+void soft_cuda_memcpy_h2d(void *dst, const void *src, size_t bytes) {
+    cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
+}
+
+void soft_cuda_memcpy_d2h(void *dst, const void *src, size_t bytes) {
+    cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);
+}
diff --git a/src/backend_cpu/backprop/matmul_b.cpp b/src/backend_cpu/backprop/matmul_b.cpp
index e69de29..106025e 100644
--- a/src/backend_cpu/backprop/matmul_b.cpp
+++ b/src/backend_cpu/backprop/matmul_b.cpp
@@ -0,0 +1,45 @@
+#include "internal_header.h"
+
+bool tensor_mul_grad_op_matrix(execution_node_t *node) {
+  
+    tensor_t *out_grad = node->t->grad;
+    tensor_t *a        = node->t->a;
+    tensor_t *b        = node->t->b;
+    assert(out_grad != NULL && a != NULL && b != NULL);
+
+    uint32_t M = a->dims[0];
+    uint32_t K = a->dims[1];
+    uint32_t N = b->is_transposed ? b->dims[0] : b->dims[1];
+
+    float *g_out  = (float *)out_grad->data;           
+    float *a_data = (float *)a->data;                  
+    float *b_data = (float *)b->data;                  
+    float *g_a    = a->grad ? (float *)a->grad->data : NULL;  
+    float *g_b    = b->grad ? (float *)b->grad->data : NULL; 
+
+    if (g_a) {
+        for (uint32_t i = 0; i < M; i++) {
+            for (uint32_t k = 0; k < K; k++) {
+                float sum = 0.0f;
+                for (uint32_t j = 0; j < N; j++) {
+                    sum += g_out[i * N + j] * b_data[k * N + j];
+                }
+                g_a[i * K + k] += sum;
+            }
+        }
+    }
+
+    if (g_b) {
+        for (uint32_t k = 0; k < K; k++) {
+            for (uint32_t j = 0; j < N; j++) {
+                float sum = 0.0f;
+                for (uint32_t i = 0; i < M; i++) {
+                    sum += a_data[i * K + k] * g_out[i * N + j];
+                }
+                g_b[k * N + j] += sum;
+            }
+        }
+    }
+
+    return true;
+}
diff --git a/src/backend_cpu/include/backprop/backprop_b.h b/src/backend_cpu/include/backprop/backprop_b.h
index 6dd53fb..a83b694 100644
--- a/src/backend_cpu/include/backprop/backprop_b.h
+++ b/src/backend_cpu/include/backprop/backprop_b.h
@@ -6,8 +6,10 @@ bool backprop__(std::vector<execution_node_t *> &nodes);
 
 bool backprop_cpu(execution_node_t *node);
 
-bool backprop_gpu([[maybe_unused]] execution_node_t *node);
 
+void soft_cuda_memset_zero(void *ptr, size_t bytes);
+void soft_cuda_memcpy_h2d(void *dst, const void *src, size_t bytes);
+void soft_cuda_memcpy_d2h(void *dst, const void *src, size_t bytes);
 bool tensor_grad_op_add(execution_node_t *node);
 
 bool tensor_grad_op_broadcasting_add(execution_node_t *node);
@@ -24,4 +26,6 @@ bool tensor_tranpose_grad_op_matrix(execution_node_t *node);
 
 bool tensor_mul_grad_op_matrix_naive(execution_node_t *node);
 
+bool tensor_mul_grad_op_matrix(execution_node_t *node);
+
 bool tensor_grad_op_square(execution_node_t *node);
diff --git a/src/backend_gpu/backprop/backprop_gpu.cu b/src/backend_gpu/backprop/backprop_gpu.cu
new file mode 100644
index 0000000..b4e3782
--- /dev/null
+++ b/src/backend_gpu/backprop/backprop_gpu.cu
@@ -0,0 +1,330 @@
+#include "internal_header.h"
+#include <cuda_runtime.h>
+#include <cassert>
+
+
+
+__global__
+static void grad_add_k(const float *g_out,
+                        float *g_a, float *g_b,
+                        uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < n; i += stride) {
+        if (g_a) g_a[i] += g_out[i];
+        if (g_b) g_b[i] += g_out[i];
+    }
+}
+
+/* -------------------------------------------------------------------------
+ * SUB backward
+ *   dL/dA[i] += dL/dOut[i]
+ *   dL/dB[i] -= dL/dOut[i]
+ * ---------------------------------------------------------------------- */
+__global__
+static void grad_sub_k(const float *g_out,
+                        float *g_a, float *g_b,
+                        uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < n; i += stride) {
+        if (g_a) g_a[i] += g_out[i];
+        if (g_b) g_b[i] -= g_out[i];
+    }
+}
+
+/* -------------------------------------------------------------------------
+ * RELU backward
+ *   dL/dA[i] += dL/dOut[i]  if A[i] > 0, else 0
+ *   d_a_fwd : the forward input values (needed for the mask)
+ * ---------------------------------------------------------------------- */
+__global__
+static void grad_relu_k(const float *g_out,
+                         const float *a_fwd,   /* forward input */
+                         float *g_a,
+                         uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < n; i += stride)
+        if (g_a) g_a[i] += (a_fwd[i] > 0.f) ? g_out[i] : 0.f;
+}
+
+/* -------------------------------------------------------------------------
+ * SQUARE backward
+ *   dL/dA[i] += 2 * A[i] * dL/dOut[i]
+ * ---------------------------------------------------------------------- */
+__global__
+static void grad_square_k(const float *g_out,
+                            const float *a_fwd,
+                            float *g_a,
+                            uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < n; i += stride)
+        if (g_a) g_a[i] += 2.f * a_fwd[i] * g_out[i];
+}
+
+/* -------------------------------------------------------------------------
+ * MEAN backward
+ *   dL/dA[i] += dL/dOut[0] / N     (uniform distribution)
+ *   g_out has exactly 1 element (it's a scalar).
+ * ---------------------------------------------------------------------- */
+__global__
+static void grad_mean_k(const float *g_out_scalar,
+                         float *g_a,
+                         uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    float scale = g_out_scalar[0] / (float)n;
+    for (; i < n; i += stride)
+        if (g_a) g_a[i] += scale;
+}
+
+/* -------------------------------------------------------------------------
+ * MUL_SCALAR backward
+ *   dL/dA[i] += dL/dOut[i] * scalar
+ *   scalar is a CPU-side float (the scalar tensor b stays on host)
+ * ---------------------------------------------------------------------- */
+__global__
+static void grad_scalar_mul_k(const float *g_out,
+                                float *g_a,
+                                float scalar,
+                                uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < n; i += stride)
+        if (g_a) g_a[i] += g_out[i] * scalar;
+}
+
+/* -------------------------------------------------------------------------
+ * BROADCAST_ADD backward
+ *   dL/dA[i,j] += dL/dOut[i,j]                   (full pass-through)
+ *   dL/dBias[j] += sum_i dL/dOut[i,j]             (column-wise sum)
+ *
+ *   We run two separate kernels:
+ *     1. Identity for g_a (same as ADD kernel)
+ *     2. Parallel column-reduce for g_bias
+ * ---------------------------------------------------------------------- */
+__global__
+static void grad_broadcast_add_a_k(const float *g_out,
+                                     float *g_a,
+                                     uint32_t total) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < total; i += stride)
+        if (g_a) g_a[i] += g_out[i];
+}
+
+/* Each thread handles one column of the bias gradient.
+ * Accumulates over all rows. */
+__global__
+static void grad_broadcast_add_bias_k(const float *g_out,
+                                        float *g_b,
+                                        uint32_t rows,
+                                        uint32_t cols) {
+    uint32_t col = blockIdx.x * blockDim.x + threadIdx.x;
+    if (col >= cols) return;
+    float acc = 0.f;
+    for (uint32_t row = 0; row < rows; row++)
+        acc += g_out[row * cols + col];
+    if (g_b) g_b[col] += acc;
+}
+
+/* -------------------------------------------------------------------------
+ * NAIVE_MATRIX_MUL backward
+ *
+ *   C = A × B    (M × K) × (K × N) → (M × N)
+ *
+ *   dL/dA[i,k] += sum_j  dL/dC[i,j] * B[k,j]   ≡  g_C × B^T
+ *   dL/dB[k,j] += sum_i  A[i,k]     * dL/dC[i,j] ≡  A^T × g_C
+ *
+ * We implement two separate SGEMM kernels using BLOCK_SIZE tiles.
+ * ---------------------------------------------------------------------- */
+#define MATMUL_BLOCK 32
+
+/* dA = g_C × B^T   shape: (M, N) × (N, K)^T → (M, K)
+ * Equivalently: dA[i,k] = sum_j g_C[i,j] * B[k,j] */
+__global__
+static void grad_matmul_dA_k(const float *g_C,  // (M, N) 
+                               const float *B,    // (K, N) 
+                               float *g_A,        // (M, K) 
+                               uint32_t M, uint32_t K, uint32_t N) {
+    uint32_t row = blockIdx.y * blockDim.y + threadIdx.y; // i ∈ [0,M) 
+    uint32_t col = blockIdx.x * blockDim.x + threadIdx.x; // k ∈ [0,K) 
+    if (row >= M || col >= K) return;
+    float acc = 0.f;
+    for (uint32_t j = 0; j < N; j++)
+        acc += g_C[row * N + j] * B[col * N + j]; /* B[k,j] */
+    if (g_A) g_A[row * K + col] += acc;
+}
+
+// dB = A^T × g_C   shape: (K, M)^T × (M, N) → (K, N)
+//  Equivalently: dB[k,j] = sum_i A[i,k] * g_C[i,j] 
+__global__
+static void grad_matmul_dB_k(const float *A,    // (M, K) 
+                               const float *g_C, // (M, N) 
+                               float *g_B,       // (K, N) 
+                               uint32_t M, uint32_t K, uint32_t N) {
+    uint32_t row = blockIdx.y * blockDim.y + threadIdx.y;  //k ∈ [0,K) 
+    uint32_t col = blockIdx.x * blockDim.x + threadIdx.x;//  j ∈ [0,N) 
+    if (row >= K || col >= N) return;
+    float acc = 0.f;
+    for (uint32_t i = 0; i < M; i++)
+        acc += A[i * K + row] * g_C[i * N + col];
+    if (g_B) g_B[row * N + col] += acc;
+}
+
+/* =========================================================================
+ * For each node, we need:
+ *   g_out       : this node's grad buffer (device)     = node->device_ptr_grad
+ *   d_a_fwd     : parent A forward data (device)       = parent_a_exec->device_ptr
+ *   d_b_fwd     : parent B forward data (device)       = parent_b_exec->device_ptr
+ *   g_a         : parent A grad buffer  (device)       = parent_a_exec->device_ptr_grad
+ *   g_b         : parent B grad buffer  (device)       = parent_b_exec->device_ptr_grad
+ *
+* We need parent execution nodes during GPU backprop, but we don't want to modify the struct. Since the caller already has the whole graph, we just pass parent nodes as arguments instead.
+ * ========================================================================= */
+
+bool backprop_gpu_dispatch(execution_node_t *node,
+                            execution_node_t *parent_a,
+                            execution_node_t *parent_b) {
+    assert(node != NULL);
+    assert(node->t != NULL);
+    assert(node->device_ptr_grad != NULL);
+
+    float *g_out = (float *)node->device_ptr_grad;
+    uint32_t n   = node->t->nvalues;
+    int block    = 256;
+    int grid     = ((int)n + block - 1) / block;
+
+    float *d_a_fwd = (parent_a) ? (float *)parent_a->device_ptr      : nullptr;
+    float *d_b_fwd = (parent_b) ? (float *)parent_b->device_ptr      : nullptr;
+
+    float *g_a     = (parent_a && parent_a->device_ptr_grad)
+                         ? (float *)parent_a->device_ptr_grad : nullptr;
+    float *g_b     = (parent_b && parent_b->device_ptr_grad)
+                         ? (float *)parent_b->device_ptr_grad : nullptr;
+
+    cudaError_t err = cudaSuccess;
+
+    switch (node->t->op) {
+    case tensor_op_t::ADD:
+        grad_add_k<<<grid, block>>>(g_out, g_a, g_b, n);
+        break;
+
+    case tensor_op_t::SUB:
+        grad_sub_k<<<grid, block>>>(g_out, g_a, g_b, n);
+        break;
+
+    case tensor_op_t::RELU:
+        if (d_a_fwd == nullptr) return false;  
+        grad_relu_k<<<grid, block>>>(g_out, d_a_fwd, g_a, n);
+        break;
+
+    case tensor_op_t::SQUARE:
+        if (d_a_fwd == nullptr) return false;
+        grad_square_k<<<grid, block>>>(g_out, d_a_fwd, g_a, n);
+        break;
+
+    case tensor_op_t::MEAN: {
+        uint32_t na = node->t->a->nvalues;
+        int ga = ((int)na + block - 1) / block;
+        grad_mean_k<<<ga, block>>>(g_out, g_a, na);
+        break;
+    }
+
+    case tensor_op_t::MUL_SCALAR: {
+        float scalar = ((float *)node->t->b->data)[0]; /* scalar on CPU */
+        grad_scalar_mul_k<<<grid, block>>>(g_out, g_a, scalar, n);
+        break;
+    }
+
+    case tensor_op_t::BROADCAST_ADD: {
+        uint32_t rows = node->t->dims[0];
+        uint32_t cols = node->t->dims[1];
+        uint32_t total = rows * cols;
+        int g_total = ((int)total + block - 1) / block;
+        grad_broadcast_add_a_k<<<g_total, block>>>(g_out, g_a, total);
+        int g_cols = ((int)cols + block - 1) / block;
+        grad_broadcast_add_bias_k<<<g_cols, block>>>(g_out, g_b, rows, cols);
+        break;
+    }
+
+    case tensor_op_t::NAIVE_MATRIX_MUL: {
+        assert(node->t->a != nullptr);
+        assert(node->t->b != nullptr);
+        if (d_a_fwd == nullptr || d_b_fwd == nullptr) return false;
+
+        uint32_t M = node->t->a->dims[0];  
+        uint32_t K = node->t->a->dims[1]; 
+        uint32_t N = node->t->b->dims[1];
+
+        dim3 blk(MATMUL_BLOCK, MATMUL_BLOCK);
+        dim3 grd_dA((K + MATMUL_BLOCK - 1) / MATMUL_BLOCK,
+                    (M + MATMUL_BLOCK - 1) / MATMUL_BLOCK);
+        dim3 grd_dB((N + MATMUL_BLOCK - 1) / MATMUL_BLOCK,
+                    (K + MATMUL_BLOCK - 1) / MATMUL_BLOCK);
+
+        if (g_a)
+            grad_matmul_dA_k<<<grd_dA, blk>>>(g_out, d_b_fwd, g_a, M, K, N);
+        if (g_b)
+            grad_matmul_dB_k<<<grd_dB, blk>>>(d_a_fwd, g_out, g_b, M, K, N);
+        break;
+    }
+
+    case tensor_op_t::MUL_MATRIX: {
+        assert(node->t->a != nullptr);
+        assert(node->t->b != nullptr);
+        if (d_a_fwd == nullptr || d_b_fwd == nullptr) return false;
+
+        uint32_t M = node->t->a->dims[0];
+        uint32_t K = node->t->a->dims[1];
+        uint32_t N = node->t->b->dims[node->t->b->is_transposed ? 0 : 1];
+
+        dim3 blk(MATMUL_BLOCK, MATMUL_BLOCK);
+        dim3 grd_dA((K + MATMUL_BLOCK - 1) / MATMUL_BLOCK,
+                    (M + MATMUL_BLOCK - 1) / MATMUL_BLOCK);
+        dim3 grd_dB((N + MATMUL_BLOCK - 1) / MATMUL_BLOCK,
+                    (K + MATMUL_BLOCK - 1) / MATMUL_BLOCK);
+
+        if (g_a)
+            grad_matmul_dA_k<<<grd_dA, blk>>>(g_out, d_b_fwd, g_a, M, K, N);
+        if (g_b)
+            grad_matmul_dB_k<<<grd_dB, blk>>>(d_a_fwd, g_out, g_b, M, K, N);
+        break;
+    }
+
+    case tensor_op_t::TRANSPOSE:
+        return false;
+
+    case tensor_op_t::NONE:
+    case tensor_op_t::CAST:
+        return true;
+
+    default:
+        debug("backprop_gpu_dispatch: unhandled op=%d\n", (int)node->t->op);
+        return false;
+    }
+
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        debug("backprop_gpu_dispatch: CUDA error: %s\n", cudaGetErrorString(err));
+        return false;
+    }
+    return true;
+}
+
+__global__ static void gpu_sgd_k(float *w, const float *g, float lr, uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        w[i] -= lr * g[i];
+    }
+}
+
+extern "C" bool tensor_sgd_gpu(float *d_w, float *d_g, float lr, uint32_t n) {
+    int block = 256;
+    int grid = ((int)n + block - 1) / block;
+    gpu_sgd_k<<<grid, block>>>(d_w, d_g, lr, n);
+    return cudaDeviceSynchronize() == cudaSuccess;
+}
+
diff --git a/src/backend_gpu/include/backprop/backprop_gpu.h b/src/backend_gpu/include/backprop/backprop_gpu.h
new file mode 100644
index 0000000..bfd7340
--- /dev/null
+++ b/src/backend_gpu/include/backprop/backprop_gpu.h
@@ -0,0 +1,9 @@
+#pragma once
+
+extern "C" {
+bool backprop_gpu_dispatch(execution_node_t *node,
+                            execution_node_t *parent_a,
+                            execution_node_t *parent_b);
+
+bool tensor_sgd_gpu(float *d_w, float *d_g, float lr, uint32_t n);
+}
diff --git a/src/backend_gpu/kernels/sgemm_double_buffer.cuh b/src/backend_gpu/kernels/sgemm_double_buffer.cuh
new file mode 100644
index 0000000..c9d3382
--- /dev/null
+++ b/src/backend_gpu/kernels/sgemm_double_buffer.cuh
@@ -0,0 +1,231 @@
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cooperative_groups.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cublas_v2.h>
+#include <cuda/barrier>
+#include <cuda_runtime.h>
+
+#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
+
+#define WARPSIZE 32
+
+namespace {
+template <const int BM, const int BN, const int BK, const int rowStrideA,
+          const int rowStrideB, typename T>
+__device__ void loadFromGmem(int N, int K, float *A, float *B, float *As,
+                             float *Bs, int innerRowA, int innerColA,
+                             int innerRowB, int innerColB, T &barrier) {
+
+  for (uint offset = 0; offset + rowStrideA <= BM; offset += rowStrideA) {
+    cuda::memcpy_async(&As[(innerColA * 4 + 0) * BM + innerRowA + offset],
+                       &A[(innerRowA + offset) * K + innerColA * 4],
+                       cuda::aligned_size_t<sizeof(float)>(sizeof(float)),
+                       barrier);
+    cuda::memcpy_async(&As[(innerColA * 4 + 1) * BM + innerRowA + offset],
+                       &A[(innerRowA + offset) * K + innerColA * 4 + 1],
+                       cuda::aligned_size_t<sizeof(float)>(sizeof(float)),
+                       barrier);
+    cuda::memcpy_async(&As[(innerColA * 4 + 2) * BM + innerRowA + offset],
+                       &A[(innerRowA + offset) * K + innerColA * 4 + 2],
+                       cuda::aligned_size_t<sizeof(float)>(sizeof(float)),
+                       barrier);
+    cuda::memcpy_async(&As[(innerColA * 4 + 3) * BM + innerRowA + offset],
+                       &A[(innerRowA + offset) * K + innerColA * 4 + 3],
+                       cuda::aligned_size_t<sizeof(float)>(sizeof(float)),
+                       barrier);
+  }
+
+  for (uint offset = 0; offset + rowStrideB <= BK; offset += rowStrideB) {
+    cuda::memcpy_async(&Bs[(innerRowB + offset) * BN + innerColB * 4],
+                       &B[(innerRowB + offset) * N + innerColB * 4],
+                       cuda::aligned_size_t<sizeof(float4)>(sizeof(float4)),
+                       barrier);
+  }
+}
+
+template <const int BM, const int BN, const int BK, const int WM, const int WN,
+          const int WMITER, const int WNITER, const int WSUBM, const int WSUBN,
+          const int TM, const int TN>
+__device__ void
+processFromSmem(float *regM, float *regN, float *threadResults, const float *As,
+                const float *Bs, const uint warpRow, const uint warpCol,
+                const uint threadRowInWarp, const uint threadColInWarp) {
+  for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
+    // populate registers for whole warptile
+    for (uint wSubRowIdx = 0; wSubRowIdx < WMITER; ++wSubRowIdx) {
+      for (uint i = 0; i < TM; ++i) {
+        regM[wSubRowIdx * TM + i] =
+            As[(dotIdx * BM) + warpRow * WM + wSubRowIdx * WSUBM +
+               threadRowInWarp * TM + i];
+      }
+    }
+    for (uint wSubColIdx = 0; wSubColIdx < WNITER; ++wSubColIdx) {
+      for (uint i = 0; i < TN; ++i) {
+        regN[wSubColIdx * TN + i] =
+            Bs[(dotIdx * BN) + warpCol * WN + wSubColIdx * WSUBN +
+               threadColInWarp * TN + i];
+      }
+    }
+
+    // execute warptile matmul
+    for (uint wSubRowIdx = 0; wSubRowIdx < WMITER; ++wSubRowIdx) {
+      for (uint wSubColIdx = 0; wSubColIdx < WNITER; ++wSubColIdx) {
+        // calculate per-thread results
+        for (uint resIdxM = 0; resIdxM < TM; ++resIdxM) {
+          for (uint resIdxN = 0; resIdxN < TN; ++resIdxN) {
+            threadResults[(wSubRowIdx * TM + resIdxM) * (WNITER * TN) +
+                          (wSubColIdx * TN) + resIdxN] +=
+                regM[wSubRowIdx * TM + resIdxM] *
+                regN[wSubColIdx * TN + resIdxN];
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace
+
+/*
+ * @tparam BM The threadblock size for M dimension SMEM caching.
+ * @tparam BN The threadblock size for N dimension SMEM caching.
+ * @tparam BK The threadblock size for K dimension SMEM caching.
+ * @tparam WM M dim of continuous tile computed by each warp
+ * @tparam WN N dim of continuous tile computed by each warp
+ * @tparam WMITER The number of subwarp tiling steps in M dimension.
+ * @tparam WNITER The number of subwarp tiling steps in N dimension.
+ * @tparam TM The per-thread tile size for M dimension.
+ * @tparam TN The per-thread tile size for N dimension.
+ */
+template <const int BM, const int BN, const int BK, const int WM, const int WN,
+          const int WNITER, const int TM, const int TN, const int NUM_THREADS>
+__global__ void __launch_bounds__(NUM_THREADS)
+    runSgemmDoubleBuffering2(int M, int N, int K, float alpha, float *A,
+                             float *B, float beta, float *C) {
+  auto block = cooperative_groups::this_thread_block();
+  __shared__ cuda::barrier<cuda::thread_scope::thread_scope_block> frontBarrier;
+  __shared__ cuda::barrier<cuda::thread_scope::thread_scope_block> backBarrier;
+  auto frontBarrierPtr = &frontBarrier;
+  auto backBarrierPtr = &backBarrier;
+  if (block.thread_rank() == 0) {
+    init(&frontBarrier, block.size());
+    init(&backBarrier, block.size());
+  }
+  __syncthreads();
+
+  const uint cRow = blockIdx.y;
+  const uint cCol = blockIdx.x;
+
+  // Placement of the warp in the threadblock tile
+  const uint warpIdx = threadIdx.x / WARPSIZE; // the warp this thread is in
+  const uint warpCol = warpIdx % (BN / WN);
+  const uint warpRow = warpIdx / (BN / WN);
+
+  // size of the warp subtile
+  constexpr uint WMITER = (WM * WN) / (WARPSIZE * TM * TN * WNITER);
+  constexpr uint WSUBM = WM / WMITER; // 64/2=32
+  constexpr uint WSUBN = WN / WNITER; // 32/2=16
+
+  // Placement of the thread in the warp subtile
+  const uint threadIdxInWarp = threadIdx.x % WARPSIZE;         // [0, 31]
+  const uint threadColInWarp = threadIdxInWarp % (WSUBN / TN); // i%(16/4)
+  const uint threadRowInWarp = threadIdxInWarp / (WSUBN / TN); // i/4
+
+  // allocate space for the current blocktile in SMEM
+  __shared__ float As[2 * BM * BK];
+  __shared__ float Bs[2 * BK * BN];
+
+  // Move blocktile to beginning of A's row and B's column
+  A += cRow * BM * K;
+  B += cCol * BN;
+  // Move C_ptr to warp's output tile
+  C += (cRow * BM + warpRow * WM) * N + cCol * BN + warpCol * WN;
+
+  // calculating the indices that this thread will load into SMEM
+  // we'll load 128bit / 32bit = 4 elements per thread at each step
+  const uint innerRowA = threadIdx.x / (BK / 4);
+  const uint innerColA = threadIdx.x % (BK / 4);
+  constexpr uint rowStrideA = (NUM_THREADS * 4) / BK;
+  const uint innerRowB = threadIdx.x / (BN / 4);
+  const uint innerColB = threadIdx.x % (BN / 4);
+  constexpr uint rowStrideB = NUM_THREADS / (BN / 4);
+
+  // allocate thread-local cache for results in registerfile
+  float threadResults[WMITER * TM * WNITER * TN] = {0.0};
+  // we cache into registers on the warptile level
+  float regM[WMITER * TM] = {0.0};
+  float regN[WNITER * TN] = {0.0};
+
+  int As_offset = 0;
+  int Bs_offset = 0;
+
+  // double-buffering: load first blocktile into SMEM
+  loadFromGmem<BM, BN, BK, rowStrideA, rowStrideB>(
+      N, K, A, B, As + As_offset * BM * BK, Bs + Bs_offset * BK * BN, innerRowA,
+      innerColA, innerRowB, innerColB, (*frontBarrierPtr));
+
+  // outer-most loop over block tiles
+  for (uint bkIdx = 0; bkIdx < K - BK; bkIdx += BK) {
+    // double-buffering: load next blocktile into SMEM
+    loadFromGmem<BM, BN, BK, rowStrideA, rowStrideB>(
+        N, K, A + BK, B + BK * N, As + (1 - As_offset) * BM * BK,
+        Bs + (1 - Bs_offset) * BK * BN, innerRowA, innerColA, innerRowB,
+        innerColB, (*backBarrierPtr));
+
+    // compute the current blocktile
+    (*frontBarrierPtr).arrive_and_wait();
+    processFromSmem<BM, BN, BK, WM, WN, WMITER, WNITER, WSUBM, WSUBN, TM, TN>(
+        regM, regN, threadResults, As + As_offset * BM * BK,
+        Bs + Bs_offset * BK * BN, warpRow, warpCol, threadRowInWarp,
+        threadColInWarp);
+    A += BK;     // move BK columns to right
+    B += BK * N; // move BK rows down
+
+    As_offset = 1 - As_offset;
+    Bs_offset = 1 - Bs_offset;
+    // swap the front and back barriers
+    auto tmp = frontBarrierPtr;
+    frontBarrierPtr = backBarrierPtr;
+    backBarrierPtr = tmp;
+
+    __syncthreads();
+  }
+
+  // compute the last blocktile
+  (*frontBarrierPtr).arrive_and_wait();
+  processFromSmem<BM, BN, BK, WM, WN, WMITER, WNITER, WSUBM, WSUBN, TM, TN>(
+      regM, regN, threadResults, As + As_offset * BM * BK,
+      Bs + Bs_offset * BK * BN, warpRow, warpCol, threadRowInWarp,
+      threadColInWarp);
+
+  // write out the results
+  for (uint wSubRowIdx = 0; wSubRowIdx < WMITER; ++wSubRowIdx) {
+    for (uint wSubColIdx = 0; wSubColIdx < WNITER; ++wSubColIdx) {
+      // move C pointer to current warp subtile
+      float *C_interim = C + (wSubRowIdx * WSUBM) * N + wSubColIdx * WSUBN;
+      for (uint resIdxM = 0; resIdxM < TM; resIdxM += 1) {
+        for (uint resIdxN = 0; resIdxN < TN; resIdxN += 4) {
+          // load C vector into registers
+          float4 tmp = reinterpret_cast<float4 *>(
+              &C_interim[(threadRowInWarp * TM + resIdxM) * N +
+                         threadColInWarp * TN + resIdxN])[0];
+          // perform GEMM update in reg
+          const int i = (wSubRowIdx * TM + resIdxM) * (WNITER * TN) +
+                        wSubColIdx * TN + resIdxN;
+          tmp.x = alpha * threadResults[i + 0] + beta * tmp.x;
+          tmp.y = alpha * threadResults[i + 1] + beta * tmp.y;
+          tmp.z = alpha * threadResults[i + 2] + beta * tmp.z;
+          tmp.w = alpha * threadResults[i + 3] + beta * tmp.w;
+          // write back
+          reinterpret_cast<float4 *>(
+              &C_interim[(threadRowInWarp * TM + resIdxM) * N +
+                         threadColInWarp * TN + resIdxN])[0] = tmp;
+        }
+      }
+    }
+  }
+}
diff --git a/src/backend_gpu/math/matmul.cu b/src/backend_gpu/math/matmul.cu
index a1672ca..9c02673 100644
--- a/src/backend_gpu/math/matmul.cu
+++ b/src/backend_gpu/math/matmul.cu
@@ -3,15 +3,17 @@
 #include <stdlib.h>
 #include <math.h>
 #include <cuda_runtime.h>
+#include <cooperative_groups.h>
+#include <cuda/barrier>
+#include "../kernels/sgemm_double_buffer.cuh"
+#include <cublas_v2.h>
 
 #define BLOCK_SIZE 32
 
 __global__
 void sgemm_naive(float *A, float *B, float *C, uint32_t M, uint32_t K, uint32_t N) {
-    
     const int col = blockIdx.x * blockDim.x + threadIdx.x;
     const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
     if (row < M && col < N) {
         float sum = 0.0f;
         for (int l = 0; l < K; l++) {
@@ -21,20 +23,53 @@ void sgemm_naive(float *A, float *B, float *C, uint32_t M, uint32_t K, uint32_t
     }
 }
 
-bool tensor_mul_op_cuda(tensor_t *t,float *d_a, float *d_b, float *d_res) {
+cublasHandle_t handle = NULL;
+bool tensor_mul_op_cuda(tensor_t *t, float *d_a, float *d_b, float *d_res) {
     uint32_t M = t->a->dims[0];
     uint32_t K = t->a->dims[1];
-    uint32_t N = t->b->dims[1];
-    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
-    dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
-    sgemm_naive<<<gridDim, blockDim>>>(d_a, d_b, d_res,M, K, N);
+    uint32_t N = t->b->is_transposed ? t->b->dims[0] : t->b->dims[1];
+    
+    float alpha = 1.0f, beta = 0.0f;
 
-    cudaError_t err = cudaDeviceSynchronize();
+    if (handle == NULL) {
+        cublasStatus_t s = cublasCreate(&handle);
+        if (s != CUBLAS_STATUS_SUCCESS) {
+            debug("tensor_mul_op_cuda: cublasCreate failed (%d)\n", (int)s);
+            return false;
+        }
+    }
 
-    if (err != cudaSuccess) {
-          debug("CUDA Add Kernel Failed: %s\n", cudaGetErrorString(err));
-          return false;
+    // if (M % 128 == 0 && N % 128 == 0 && K % 16 == 0) {
+    //     dim3 block(256);
+    //     dim3 grid(CEIL_DIV(N, 128), CEIL_DIV(M, 128));
+    //     runSgemmDoubleBuffering2<128, 128, 16, 64, 64, 4, 8, 4, 256>
+    //         <<<grid, block>>>(M, N, K, 1.0f, d_a, d_b, 0.0f, d_res);
+    // } else {
+    //     dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
+    //     dim3 gridDim(CEIL_DIV(N, BLOCK_SIZE), CEIL_DIV(M, BLOCK_SIZE));
+    //     sgemm_naive<<<gridDim, blockDim>>>(d_a, d_b, d_res, M, K, N);
+    // }
+
+    cublasStatus_t stat = cublasSgemm(
+        handle,
+        CUBLAS_OP_N, CUBLAS_OP_N,
+        (int)N, (int)M, (int)K,
+        &alpha,
+        d_b, (int)N,
+        d_a, (int)K,
+        &beta,
+        d_res, (int)N
+    );
+
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        debug("tensor_mul_op_cuda: cublasSgemm failed (%d)\n", (int)stat);
+        return false;
     }
 
+    cudaError_t err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        debug("CUDA Matmul Kernel Failed: %s\n", cudaGetErrorString(err));
+        return false;
+    }
     return true;
 }
diff --git a/src/core/graph/DAGbuild.cpp b/src/core/graph/DAGbuild.cpp
index 277c33c..63eec38 100644
--- a/src/core/graph/DAGbuild.cpp
+++ b/src/core/graph/DAGbuild.cpp
@@ -33,6 +33,10 @@ bool verifyIfDAG(tensor_pool_t *pool, tensor_t *t, std::vector<execution_node_t
     en->id = id;
     en->to_device_needed = false;
     en->device_ptr = NULL;
+    en->device_ptr_grad = NULL;
+    en->backend_fn = NULL;
+    en->parent_pos[0] = -1;
+    en->parent_pos[1] = -1;
     seq.push_back(en);
     return true;
 }
diff --git a/src/core/graph/assignBackend.cu b/src/core/graph/assignBackend.cu
index c4b8c41..402c77c 100644
--- a/src/core/graph/assignBackend.cu
+++ b/src/core/graph/assignBackend.cu
@@ -38,40 +38,79 @@ void assignBackendGpu(execution_node_t *e) {
 /* NOTE: Remember that when op is  tensor_op_t::NONE you assign it to CPU,
  *       The reason being that it would the dangling node.
  */
-device_type assignDevice([[maybe_unused]] uint8_t ndims, [[maybe_unused]] uint32_t *dims,
-                         [[maybe_unused]] tensor_op_t op, uint32_t nvalues, json &data) {
-    if(op == tensor_op_t::NONE) {
-        return device_type::CPU;
+/* -------------------------------------------------------------------------
+ * assignDevice  — looks up the CONFIG.soft JSON and returns the best
+ * backend for the given op and tensor size.
+ *
+ * JSON schema per op:
+ *   "op_name": [
+ *     { "min": 0, "max": 127, "backend": "cpu" },
+ *     { "min": 128, "max": 4294967295, "backend": "cuda" }
+ *   ]
+ *   OR
+ *   { "backend": "cuda" }  (no size range — always this backend)
+ *
+ * Returns device_type::CPU if:
+ *   • op is NONE (leaf node)
+ *   • op key not found in JSON
+ *   • no matching size range
+ * ---------------------------------------------------------------------*/
+device_type assignDevice([[maybe_unused]] uint8_t ndims,
+                         [[maybe_unused]] uint32_t *dims,
+                         tensor_op_t op, uint32_t nvalues, json &data) {
+    if (op == tensor_op_t::NONE) return device_type::CPU;
+
+    const char *key = nullptr;
+    switch (op) {
+        case tensor_op_t::ADD:              key = "add";           break;
+        case tensor_op_t::SUB:              key = "sub";           break;
+        case tensor_op_t::RELU:             key = "relu";          break;
+        case tensor_op_t::SQUARE:           key = "square";        break;
+        case tensor_op_t::MEAN:             key = "mean";          break;
+        case tensor_op_t::MUL_MATRIX:       key = "matmul";        break;
+        case tensor_op_t::NAIVE_MATRIX_MUL: key = "matmul";        break;
+        case tensor_op_t::MUL_SCALAR:       key = "mul_scalar";    break;
+        case tensor_op_t::BROADCAST_ADD:    key = "broadcast_add"; break;
+        case tensor_op_t::TRANSPOSE:        // always CPU         return device_type::CPU;
+        default:                                                    return device_type::CPU;
     }
+
     try {
+        if (!data.contains("ops") || !data["ops"].contains(key))
+            return device_type::CPU;
 
-        if (op == tensor_op_t::MUL_MATRIX) {
-            auto params = data["ops"]["matmul"];
-            for(auto param: params) {
-                if(param["min"] <= nvalues && param["max"] > nvalues) {
-                    if(param["backend"] == "cpu") {
-                        return device_type::CPU;
-                    } else {
-                        return device_type::GPU;
-                    }
-                }
-            }
-        }
-        if (op == tensor_op_t::ADD) {
-            auto params = data["ops"]["add"];
-            for(auto param: params) {
-                if(!param.contains("min") || (param["min"] <= nvalues && param["max"] > nvalues)) {
-                    if(param["backend"] == "cpu") {
-                        return device_type::CPU;
-                    } else {
-                        return device_type::GPU;
-                    }
+        auto &params = data["ops"][key];
+
+        // params may be an array of range objects or a single object 
+        auto lookup = [&](const json &entry) -> device_type {
+            bool in_range = !entry.contains("min") ||
+                            (entry["min"].get<uint32_t>() <= nvalues &&
+                             entry["max"].get<uint32_t>() > nvalues);
+            if (!in_range) return device_type::CPU; // sentinel — skip 
+            return (entry["backend"] == "cpu") ? device_type::CPU
+                                               : device_type::GPU;
+        };
+
+        if (params.is_array()) {
+            for (auto &p : params) {
+                if (p.contains("min") &&
+                    p["min"].get<uint32_t>() <= nvalues &&
+                    p["max"].get<uint32_t>() > nvalues) {
+                    return (p["backend"] == "cpu") ? device_type::CPU
+                                                   : device_type::GPU;
+                } else if (!p.contains("min")) {
+                    // No range -> applies to all sizes
+                    return (p["backend"] == "cpu") ? device_type::CPU
+                                                   : device_type::GPU;
                 }
             }
+        } else if (params.is_object()) {
+            return lookup(params);
         }
-    } catch (const std::exception& e) {
-       debug("Error: %s\n", e.what());
+    } catch (const std::exception &e) {
+        debug("assignDevice: JSON error: %s\n", e.what());
     }
+
     return device_type::CPU;
 }
 
@@ -154,9 +193,9 @@ void assignBackendGraph(tensor_pool_t *pool,std::vector<execution_node_t *> &nod
         }
     } else if (backend == backend_mode::HYBRID) {
         json data{};
-        std::string path{};
+        std::string path = ".config/soft-cuda/CONFIG.soft";
         if (getenv("HOME") != nullptr) {
-            static std::string path = std::string(getenv("HOME")) + "/.config/soft-cuda/CONFIG.soft";
+            path = std::string(getenv("HOME")) + "/" + path;
         }
         if (std::filesystem::exists(path)) {
             data = readJsonToMap(path); // passing the char pointer
@@ -176,6 +215,11 @@ changedToGPU:
             // TODO: How to get access to the execution_node place in graph when we just know the
             // tensor
             assignPlaceOnDeviceMemory(pool, (int32_t)(node->pos), nodes);
+            if (node->device_ptr == NULL) {
+                // If allocation failed, fall back to CPU
+                assignBackendCpu(node);
+                continue;
+            }
             if (node->t->a == NULL)
                 goto trp;
             if (node->t->a->device == device_type::CPU) {
@@ -244,7 +288,7 @@ bool tensor_graph_forward_evaluate(tensor_pool_t *pool_cpu, tensor_pool_t *pool_
 
                 // NOTE: we are totally ignoring the to_device_needed flag am i missing something which i thought i needed.
                 // TODO: Review this part once again maybe try for streams for better performance or whatever
-                if (parent_a->t->device == device_type::CPU) {
+                if (parent_a->t->device == device_type::CPU && parent_a->device_ptr != NULL) {
                     size_t size_a = nodes[(size_t)a_idx]->t->nvalues * sizeof(float) ;
                     cudaError_t err_a = cudaMemcpy(parent_a->device_ptr, parent_a->t->data, size_a, cudaMemcpyHostToDevice);
                     if (err_a != cudaSuccess) {
@@ -256,7 +300,7 @@ bool tensor_graph_forward_evaluate(tensor_pool_t *pool_cpu, tensor_pool_t *pool_
             int32_t b_idx = getTheExecutionNodeIndex(node, 1);
             if(b_idx != -1){
                 auto parent_b = nodes[(size_t)b_idx];
-                if (parent_b->t->device == device_type::CPU) {
+                if (parent_b->t->device == device_type::CPU && parent_b->device_ptr != NULL) {
                     size_t size_b = nodes[(size_t)b_idx]->t->nvalues * sizeof(float) ;
                     cudaError_t err_b = cudaMemcpy(parent_b->device_ptr, parent_b->t->data, size_b, cudaMemcpyHostToDevice);
                     if (err_b != cudaSuccess) {
@@ -332,15 +376,30 @@ void assignGradMemory(tensor_pool_t *pool_grad_cpu, tensor_pool_t *pool_grad_gpu
         // So we are going to make a tensor instance here and assign it here i guess. 
         
         if(node->t->grad_compute == false) continue;
-
-        node->t->grad =  tensor_dtype_create(pool_grad_cpu, node->t->dtype, node->t->ndims, node->t->dims, NULL);
-
+        node->t->grad = tensor_dtype_create(pool_grad_cpu, node->t->dtype, node->t->ndims, node->t->dims, NULL);
         if(node->backend_fn == tensor_evaluate_GPU) {
-            size_t size = node->t->nvalues * sizeof(float) ;
+            size_t size = node->t->nvalues * sizeof(float);
             uint32_t id;
             node->device_ptr_grad = tensor_pool_alloc(pool_grad_gpu, size, &id);
         }
     }
+
+    for (int i = (int)nodes.size() - 1; i >= 0; i--) {
+        execution_node_t *node = nodes[i];
+        if (node->device_ptr_grad == NULL) continue;
+
+        for (int p = 0; p < 2; p++) {
+            int32_t p_idx = getTheExecutionNodeIndex(node, p);
+            if (p_idx == -1) continue;
+            execution_node_t *parent = nodes[p_idx];
+            if (parent->t->grad_compute && parent->device_ptr_grad == NULL) {
+                size_t size = parent->t->nvalues * sizeof(float);
+                uint32_t id;
+                parent->device_ptr_grad = tensor_pool_alloc(pool_grad_gpu, size, &id);
+            }
+        }
+    }
+
 }
 
 // Since autograd is not ready I am making two function one for device to host copy and one which iterate over graph and transfer them back to cpu
@@ -354,11 +413,9 @@ void autogradGpuMemTranfer(std::vector<execution_node_t *> &nodes) {
     for(auto node : nodes) {
         if (node->backend_fn == tensor_evaluate_GPU) {
             // cause we do implict fallback to cpu in case we don't have that op on gpu
-            if (node->t->data == NULL) {
+            if (node->t->grad_compute && node->t->data != NULL) {
                 fromDeviceToHost(node);
             }
-            // Not doing backend_fn mutation cause then we will have to run assignBackend again and again
-            node->device_ptr_grad = NULL;
         }
     } 
 }
diff --git a/src/core/graph/train.cpp b/src/core/graph/train.cpp
index f0c01a9..df08c57 100644
--- a/src/core/graph/train.cpp
+++ b/src/core/graph/train.cpp
@@ -14,13 +14,37 @@ void tensor_sgd(std::vector<execution_node_t *> &nodes, float learning_rate) {
         if (!t->grad_compute)           continue;
         if (t->grad == NULL)            continue;
 
+        // If both weights and grads are on GPU, update on GPU
+        if (node->device_ptr != NULL && node->device_ptr_grad != NULL) {
+            if (tensor_sgd_gpu((float *)node->device_ptr, (float *)node->device_ptr_grad, 
+                               learning_rate, t->nvalues)) {
+                continue; // Success, skip CPU update
+            }
+        }
+
+        if (node->device_ptr_grad != NULL && t->grad->data != NULL) {
+            soft_cuda_memcpy_d2h(t->grad->data, node->device_ptr_grad,
+                                  t->grad->nvalues * sizeof(float));
+        }
+        
+        // If weights were on GPU, we might need to pull them too, 
+        // but leaf weights usually stay on host unless moved.
+        if (node->device_ptr != NULL && t->data != NULL) {
+             soft_cuda_memcpy_d2h(t->data, node->device_ptr,
+                                  t->nvalues * sizeof(float));
+        }
         float *w    = (float *)t->data;
         float *grad = (float *)t->grad->data;
         uint32_t n  = t->nvalues;
 
         for (uint32_t i = 0; i < n; i++) {
             w[i] -= learning_rate * grad[i];
-            // grad[i] = 0.0f;   
+        }
+
+        // If we updated on CPU but they are shadowed on GPU, push back
+        if (node->device_ptr != NULL && t->data != NULL) {
+             soft_cuda_memcpy_h2d(node->device_ptr, t->data,
+                                  t->nvalues * sizeof(float));
         }
     }
 }
diff --git a/src/core/include/graph/assignBackend.h b/src/core/include/graph/assignBackend.h
index 2547549..00e99b8 100644
--- a/src/core/include/graph/assignBackend.h
+++ b/src/core/include/graph/assignBackend.h
@@ -8,8 +8,8 @@ using json = nlohmann::json;
 void assignBackend(execution_node_t *e);
 
 device_type assignDevice(uint8_t ndims, uint32_t *dims, tensor_op_t op, uint32_t nvalues, json &data);
-
-int32_t getTheExecutionNodeIndex(uint32_t id, std::vector<execution_node_t *> &nodes);
+// returns -1 if no such parent
+int32_t getTheExecutionNodeIndex(execution_node_t *node, uint32_t idx);
 
 void assignBackendGraph(tensor_pool_t *pool, std::vector<execution_node_t *> &nodes);
 
diff --git a/src/init/config/profiler.cu b/src/init/config/profiler.cu
new file mode 100644
index 0000000..eb4ff6b
--- /dev/null
+++ b/src/init/config/profiler.cu
@@ -0,0 +1,9 @@
+#include "internal_header.h"
+
+int main(int argc, char **argv) {
+    const char *path = (argc > 1) ? argv[1] : nullptr;
+    printf("||==============================================||\n");
+    printf("||  soft-cuda AOT Hardware Profiler             ||\n");
+    printf("||==============================================||\n\n");
+    return soft_profile_and_write(path);
+}
diff --git a/src/init/config/profiler_core.cu b/src/init/config/profiler_core.cu
new file mode 100644
index 0000000..a2638e3
--- /dev/null
+++ b/src/init/config/profiler_core.cu
@@ -0,0 +1,486 @@
+/**
+ * soft-cuda  — AOT Hardware Profiler
+ *
+ * Measures real CPU vs GPU throughput for every supported op at several
+ * element-count breakpoints, finds the crossover threshold, then writes
+ * ~/.config/soft-cuda/CONFIG.soft so every subsequent run uses the
+ * optimal backend without measuring again.
+ *
+ * Usage (standalone binary built by CMake target `soft_profiler`):
+ *   ./soft_profiler                      # writes to default path
+ *   ./soft_profiler /path/to/CONFIG.soft # custom path
+ *
+ * Algorithm
+ * ---------
+ *  For each op O and each candidate size S (from SIZE_BREAKPOINTS):
+ *    cpu_ms  = median of REPS timed CPU executions
+ *    gpu_ms  = median of REPS timed GPU kernel + cudaDeviceSynchronize
+ *  The crossover for op O is the smallest S where gpu_ms < cpu_ms.
+ *  Below crossover → "cpu", at or above → "cuda".
+ *
+ * Staleness detection
+ * -------------------
+ *  We hash the CUDA device UUID.  If the file already exists and its
+ *  device_hash matches, we skip profiling entirely (fast path).
+ */
+
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <algorithm>
+
+#include <cuda_runtime.h>
+
+
+static constexpr int   WARMUP_REPS = 10;
+static constexpr int   BENCH_REPS  = 30;
+
+
+static const uint32_t SIZE_BREAKPOINTS[] = {
+    64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216
+};
+static constexpr int N_BREAKPOINTS = (int)(sizeof(SIZE_BREAKPOINTS) / sizeof(SIZE_BREAKPOINTS[0]));
+
+
+static std::string device_fingerprint() {
+    cudaDeviceProp prop{};
+    cudaGetDeviceProperties(&prop, 0);
+    char hex[33];
+    for (int i = 0; i < 16; i++) {
+        snprintf(hex + i * 2, 3, "%02x",
+                 (unsigned char)prop.uuid.bytes[i]);
+    }
+    return std::string(hex, 32);
+}
+
+
+static double now_ms() {
+    using namespace std::chrono;
+    return (double)duration_cast<nanoseconds>(
+               high_resolution_clock::now().time_since_epoch())
+               .count() * 1e-6;
+}
+
+static double gpu_event_ms(cudaEvent_t start, cudaEvent_t stop) {
+    float ms = 0.0f;
+    cudaEventElapsedTime(&ms, start, stop);
+    return (double)ms;
+}
+
+/* Return median of a sorted copy of v */
+static double median(std::vector<double> v) {
+    std::sort(v.begin(), v.end());
+    size_t n = v.size();
+    if (n % 2 == 0) return (v[n/2-1] + v[n/2]) / 2.0;
+    return v[n/2];
+}
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////   CPU KERNELS REF  ////////////////////////////////////
+///////////////// THESE SHOULD MIRROR THE ONE IN LIB ////////////////////
+static void cpu_add(const float *__restrict__ a,
+                    const float *__restrict__ b,
+                    float *__restrict__ out, uint32_t n) {
+    for (uint32_t i = 0; i < n; i++) out[i] = a[i] + b[i];
+}
+static void cpu_sub(const float *__restrict__ a,
+                    const float *__restrict__ b,
+                    float *__restrict__ out, uint32_t n) {
+    for (uint32_t i = 0; i < n; i++) out[i] = a[i] - b[i];
+}
+static void cpu_relu(const float *__restrict__ a,
+                     float *__restrict__ out, uint32_t n) {
+    for (uint32_t i = 0; i < n; i++) out[i] = a[i] > 0.f ? a[i] : 0.f;
+}
+static void cpu_square(const float *__restrict__ a,
+                       float *__restrict__ out, uint32_t n) {
+    for (uint32_t i = 0; i < n; i++) out[i] = a[i] * a[i];
+}
+static void cpu_mean(const float *__restrict__ a, uint32_t n, float *out) {
+    double s = 0.0;
+    for (uint32_t i = 0; i < n; i++) s += a[i];
+    *out = (float)(s / n);
+}
+
+static void cpu_matmul(const float *A, const float *B, float *C,
+                       uint32_t M, uint32_t K, uint32_t N) {
+    for (uint32_t i = 0; i < M; i++)
+        for (uint32_t j = 0; j < N; j++) {
+            float s = 0.f;
+            for (uint32_t k = 0; k < K; k++)
+                s += A[i*K+k] * B[k*N+j];
+            C[i*N+j] = s;
+        }
+}
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////   GPU KERNELS REF  ////////////////////////////////////
+///////////////// THESE SHOULD MIRROR THE ONE IN LIB ////////////////////
+
+__global__ static void gpu_add_k(const float *a, const float *b,
+                                  float *out, uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < n; i += stride) out[i] = a[i] + b[i];
+}
+__global__ static void gpu_sub_k(const float *a, const float *b,
+                                  float *out, uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < n; i += stride) out[i] = a[i] - b[i];
+}
+__global__ static void gpu_relu_k(const float *a, float *out, uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < n; i += stride) out[i] = a[i] > 0.f ? a[i] : 0.f;
+}
+__global__ static void gpu_square_k(const float *a, float *out, uint32_t n) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t stride = blockDim.x * gridDim.x;
+    for (; i < n; i += stride) out[i] = a[i] * a[i];
+}
+
+__global__ static void gpu_reduce_k(const float *in, float *partial,
+                                     uint32_t n) {
+    extern __shared__ float smem[];
+    uint32_t tid = threadIdx.x;
+    uint32_t i   = blockIdx.x * blockDim.x + tid;
+    smem[tid] = (i < n) ? in[i] : 0.f;
+    __syncthreads();
+    for (uint32_t s = blockDim.x / 2; s > 0; s >>= 1) {
+        if (tid < s) smem[tid] += smem[tid + s];
+        __syncthreads();
+    }
+    if (tid == 0) partial[blockIdx.x] = smem[0];
+}
+__global__ static void gpu_matmul_k(const float *A, const float *B,
+                                     float *C,
+                                     uint32_t M, uint32_t K, uint32_t N) {
+    uint32_t col = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row < M && col < N) {
+        float s = 0.f;
+        for (uint32_t l = 0; l < K; l++) s += A[row*K+l] * B[l*N+col];
+        C[row*N+col] = s;
+    }
+}
+
+struct BenchResult { double cpu_ms, gpu_ms; };
+
+static void fill(float *p, uint32_t n, float base) {
+    for (uint32_t i = 0; i < n; i++) p[i] = base + (float)(i % 256) * 0.001f;
+}
+
+enum class OpKind { ADD, SUB, RELU, SQUARE, MEAN, MATMUL };
+
+static BenchResult bench_elementwise(OpKind op, uint32_t n,
+                                     float *h_a, float *h_b, float *h_out,
+                                     float *d_a, float *d_b, float *d_out) {
+    std::vector<double> cpu_times;
+    cpu_times.reserve(BENCH_REPS);
+    for (int r = 0; r < WARMUP_REPS + BENCH_REPS; r++) {
+        double t0 = now_ms();
+        switch (op) {
+            case OpKind::ADD:    cpu_add(h_a, h_b, h_out, n); break;
+            case OpKind::SUB:    cpu_sub(h_a, h_b, h_out, n); break;
+            case OpKind::RELU:   cpu_relu(h_a, h_out, n);      break;
+            case OpKind::SQUARE: cpu_square(h_a, h_out, n);    break;
+            case OpKind::MEAN:   cpu_mean(h_a, n, h_out);      break;
+            default: break;
+        }
+        double t1 = now_ms();
+        if (r >= WARMUP_REPS) cpu_times.push_back(t1 - t0);
+    }
+
+    cudaEvent_t ev0, ev1;
+    cudaEventCreate(&ev0); cudaEventCreate(&ev1);
+
+    int block = 256;
+    int grid  = ((int)n + block - 1) / block;
+
+    std::vector<double> gpu_times;
+    gpu_times.reserve(BENCH_REPS);
+    for (int r = 0; r < WARMUP_REPS + BENCH_REPS; r++) {
+        cudaEventRecord(ev0);
+        switch (op) {
+            case OpKind::ADD:
+                gpu_add_k<<<grid, block>>>(d_a, d_b, d_out, n); break;
+            case OpKind::SUB:
+                gpu_sub_k<<<grid, block>>>(d_a, d_b, d_out, n); break;
+            case OpKind::RELU:
+                gpu_relu_k<<<grid, block>>>(d_a, d_out, n);      break;
+            case OpKind::SQUARE:
+                gpu_square_k<<<grid, block>>>(d_a, d_out, n);    break;
+            case OpKind::MEAN: {
+                /* Two-pass reduction */
+                int g2 = (int)(((uint32_t)grid + (uint32_t)block - 1) / (uint32_t)block);
+                float *d_partial = nullptr;
+                cudaMalloc(&d_partial, (size_t)grid * sizeof(float));
+                gpu_reduce_k<<<grid, block, (size_t)block * sizeof(float)>>>(
+                    d_a, d_partial, n);
+                gpu_reduce_k<<<g2, block, (size_t)block * sizeof(float)>>>(
+                    d_partial, d_out, (uint32_t)grid);
+                cudaFree(d_partial);
+                break;
+            }
+            default: break;
+        }
+        cudaEventRecord(ev1);
+        cudaEventSynchronize(ev1);
+        if (r >= WARMUP_REPS)
+            gpu_times.push_back(gpu_event_ms(ev0, ev1));
+    }
+    cudaEventDestroy(ev0); cudaEventDestroy(ev1);
+    return { median(cpu_times), median(gpu_times) };
+}
+
+
+static BenchResult bench_matmul(uint32_t M, uint32_t K, uint32_t N,
+                                 float *h_a, float *h_b, float *h_out,
+                                 float *d_a, float *d_b, float *d_out) {
+    std::vector<double> cpu_times;
+    for (int r = 0; r < WARMUP_REPS + BENCH_REPS; r++) {
+        double t0 = now_ms();
+        cpu_matmul(h_a, h_b, h_out, M, K, N);
+        double t1 = now_ms();
+        if (r >= WARMUP_REPS) cpu_times.push_back(t1 - t0);
+    }
+    cudaEvent_t ev0, ev1;
+    cudaEventCreate(&ev0); cudaEventCreate(&ev1);
+    dim3 blk(32, 32);
+    dim3 grd((N+31)/32, (M+31)/32);
+    std::vector<double> gpu_times;
+    for (int r = 0; r < WARMUP_REPS + BENCH_REPS; r++) {
+        cudaEventRecord(ev0);
+        gpu_matmul_k<<<grd, blk>>>(d_a, d_b, d_out, M, K, N);
+        cudaEventRecord(ev1);
+        cudaEventSynchronize(ev1);
+        if (r >= WARMUP_REPS)
+            gpu_times.push_back(gpu_event_ms(ev0, ev1));
+    }
+    cudaEventDestroy(ev0); cudaEventDestroy(ev1);
+    return { median(cpu_times), median(gpu_times) };
+}
+
+
+
+static uint32_t find_crossover(OpKind op, uint32_t max_n,
+                                float *h_a, float *h_b, float *h_out,
+                                float *d_a, float *d_b, float *d_out) {
+    uint32_t crossover = UINT32_MAX;
+    for (int bi = 0; bi < N_BREAKPOINTS; bi++) {
+        uint32_t n = SIZE_BREAKPOINTS[bi];
+        if (n > max_n) break;
+        BenchResult r = bench_elementwise(op, n, h_a, h_b, h_out, d_a, d_b, d_out);
+        printf("      n=%-9u  cpu=%.4f ms  gpu=%.4f ms  winner=%s\n",
+               n, r.cpu_ms, r.gpu_ms, r.gpu_ms < r.cpu_ms ? "GPU" : "CPU");
+        if (r.gpu_ms < r.cpu_ms && crossover == UINT32_MAX) {
+            crossover = n;
+            break; 
+        }
+    }
+    return crossover;
+}
+
+
+static uint32_t find_crossover_matmul(uint32_t max_n,
+                                       float *h_a, float *h_b, float *h_out,
+                                       float *d_a, float *d_b, float *d_out) {
+    uint32_t crossover = UINT32_MAX;
+    for (int bi = 0; bi < N_BREAKPOINTS; bi++) {
+        uint32_t side = (uint32_t)sqrtf((float)SIZE_BREAKPOINTS[bi]);
+        if (side < 2) side = 2;
+        uint32_t n = side * side; 
+        if (n * 3 > max_n) break;
+        BenchResult r = bench_matmul(side, side, side,
+                                      h_a, h_b, h_out, d_a, d_b, d_out);
+        printf("      side=%-5u n=%-9u  cpu=%.4f ms  gpu=%.4f ms  winner=%s\n",
+               side, n, r.cpu_ms, r.gpu_ms, r.gpu_ms < r.cpu_ms ? "GPU" : "CPU");
+        if (r.gpu_ms < r.cpu_ms && crossover == UINT32_MAX) {
+            crossover = n;
+            break;
+        }
+    }
+    return crossover;
+}
+
+
+
+struct OpThreshold {
+    const char *name;
+    uint32_t    crossover; 
+};
+
+static void write_config(const std::string &path,
+                         const std::string &dev_hash,
+                         const char *dev_name,
+                         double cc, size_t vram_mb,
+                         const std::vector<OpThreshold> &thresholds) {
+    std::ofstream f(path);
+    if (!f.is_open()) {
+        fprintf(stderr, "[profiler] ERROR: cannot write %s\n", path.c_str());
+        return;
+    }
+
+    time_t now = time(nullptr);
+    char ts[32];
+    strftime(ts, sizeof(ts), "%Y-%m-%dT%H:%M:%SZ", gmtime(&now));
+
+    f << "{\n";
+    f << "  \"meta\": {\n";
+    f << "    \"soft_version\": \"0.1.0\",\n";
+    f << "    \"device_hash\": \"" << dev_hash << "\",\n";
+    f << "    \"generated_at\": \"" << ts << "\"\n";
+    f << "  },\n";
+    f << "  \"device\": {\n";
+    f << "    \"name\": \"" << dev_name << "\",\n";
+    f << "    \"type\": \"cuda\",\n";
+    f << "    \"compute_capability\": " << cc << ",\n";
+    f << "    \"vram_mb\": " << vram_mb << "\n";
+    f << "  },\n";
+    f << "  \"ops\": {\n";
+
+    for (size_t i = 0; i < thresholds.size(); i++) {
+        const auto &t = thresholds[i];
+        f << "    \"" << t.name << "\": [\n";
+        if (t.crossover == UINT32_MAX) {
+            f << "      { \"backend\": \"cpu\" }\n";
+        } else if (t.crossover == 0) {
+            f << "      { \"backend\": \"cuda\" }\n";
+        } else {
+            f << "      { \"min\": 0, \"max\": " << t.crossover
+              << ", \"backend\": \"cpu\" },\n";
+            f << "      { \"min\": " << t.crossover
+              << ", \"max\": 4294967295, \"backend\": \"cuda\" }\n";
+        }
+        f << "    ]";
+        if (i + 1 < thresholds.size()) f << ",";
+        f << "\n";
+    }
+
+    f << "  },\n";
+    f << "  \"pool\": {\n";
+    f << "    \"device\": \"cuda\",\n";
+    f << "    \"block_size\": 2097152\n";
+    f << "  }\n";
+    f << "}\n";
+    f.close();
+    printf("[profiler] Config written to: %s\n", path.c_str());
+}
+
+/* -----------------------------------------------------------------------
+ * Public entry point — called by soft_profiler binary and optionally by
+ * the library's sc_init() if no config exists.
+ * ----------------------------------------------------------------------- */
+int soft_profile_and_write(const char *out_path) {
+    // Detect CUDA device 
+    int device_count = 0;
+    cudaGetDeviceCount(&device_count);
+    if (device_count == 0) {
+        fprintf(stderr, "[profiler] No CUDA devices found — writing CPU-only config.\n");
+        // Write a minimal CPU-only config 
+        //
+        return 1;
+    }
+
+    cudaDeviceProp prop{};
+    cudaGetDeviceProperties(&prop, 0);
+    cudaSetDevice(0);
+
+    printf("[profiler] Device: %s  CC=%d.%d  VRAM=%zu MB\n",
+           prop.name,
+           prop.major, prop.minor,
+           prop.totalGlobalMem / (1024*1024));
+
+    std::string dev_hash = device_fingerprint();
+    double cc = prop.major + prop.minor * 0.1;
+    size_t vram_mb = prop.totalGlobalMem / (1024*1024);
+
+    /// Check if existing config is latest
+    if (out_path && std::filesystem::exists(out_path)) {
+        std::ifstream f(out_path);
+        std::string content((std::istreambuf_iterator<char>(f)),
+                             std::istreambuf_iterator<char>());
+        if (content.find(dev_hash) != std::string::npos) {
+            printf("[profiler] Config is up-to-date (device_hash matches). Skipping.\n");
+            return 0;
+        }
+    }
+
+    uint32_t max_elems = (uint32_t)std::min(
+        (size_t)16777216UL,   
+        prop.totalGlobalMem / (8UL * sizeof(float))
+    );
+    printf("[profiler] Probing up to %u elements per op\n\n", max_elems);
+
+    float *h_a   = (float *)malloc(max_elems * sizeof(float));
+    float *h_b   = (float *)malloc(max_elems * sizeof(float));
+    float *h_out = (float *)malloc(max_elems * sizeof(float));
+    fill(h_a,   max_elems, 1.5f);
+    fill(h_b,   max_elems, 0.7f);
+    memset(h_out, 0, max_elems * sizeof(float));
+
+    float *d_a, *d_b, *d_out;
+    cudaMalloc(&d_a,   max_elems * sizeof(float));
+    cudaMalloc(&d_b,   max_elems * sizeof(float));
+    cudaMalloc(&d_out, max_elems * sizeof(float));
+    cudaMemcpy(d_a, h_a, max_elems * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_b, h_b, max_elems * sizeof(float), cudaMemcpyHostToDevice);
+
+    gpu_add_k<<<256, 256>>>(d_a, d_b, d_out, 1024);
+    cudaDeviceSynchronize();
+
+    std::vector<OpThreshold> thresholds;
+
+    struct { const char *name; OpKind kind; } ops[] = {
+        {"add",    OpKind::ADD},
+        {"sub",    OpKind::SUB},
+        {"relu",   OpKind::RELU},
+        {"square", OpKind::SQUARE},
+        {"mean",   OpKind::MEAN},
+    };
+
+    for (auto &o : ops) {
+        printf("  [%s]\n", o.name);
+        uint32_t xover = find_crossover(o.kind, max_elems,
+                                         h_a, h_b, h_out, d_a, d_b, d_out);
+        printf("  → crossover at n=%u\n\n", xover);
+        thresholds.push_back({o.name, xover});
+    }
+
+    printf("  [matmul]\n");
+    uint32_t mm_xover = find_crossover_matmul(max_elems,
+                                               h_a, h_b, h_out, d_a, d_b, d_out);
+    printf("  → crossover at n=%u\n\n", mm_xover);
+    thresholds.push_back({"matmul", mm_xover});
+
+    thresholds.push_back({"mul_scalar", thresholds[0].crossover});
+    thresholds.push_back({"broadcast_add", thresholds[0].crossover});
+
+    cudaFree(d_a); cudaFree(d_b); cudaFree(d_out);
+    free(h_a); free(h_b); free(h_out);
+
+    std::string final_path;
+    if (out_path && out_path[0] != '\0') {
+        final_path = out_path;
+    } else {
+        const char *home = getenv("HOME");
+        if (!home) home = "/tmp";
+        final_path = std::string(home) + "/.config/soft-cuda/CONFIG.soft";
+    }
+
+    std::filesystem::create_directories(
+        std::filesystem::path(final_path).parent_path());
+
+    write_config(final_path, dev_hash, prop.name, cc, vram_mb, thresholds);
+    return 0;
+}
+
diff --git a/src/internal_header.h b/src/internal_header.h
index 0aecf18..43638bc 100644
--- a/src/internal_header.h
+++ b/src/internal_header.h
@@ -3,6 +3,7 @@
 #include "../include/soft-cuda/tensor/api.h"
 #include "../include/soft-cuda/tensor/debug_api.h"
 #include "../include/soft-cuda/python/soft_cuda_python.h"
+#include "../include/soft-cuda/profiler/profiler_core.h"
 
 // PRIVATE DECLARATIONS
 #include "./core/include/tensor/tensor.h"
@@ -30,4 +31,5 @@
 #include "backend_gpu/include/math/mean.h"
 #include "backend_gpu/include/math/scalar_mul.h"
 #include "backend_gpu/include/math/broadcast_add.h"
+#include "backend_gpu/include/backprop/backprop_gpu.h"
 // clang-format on
diff --git a/src/python/sc_bridge.cpp b/src/python/sc_bridge.cpp
index c51f21e..35c51d4 100644
--- a/src/python/sc_bridge.cpp
+++ b/src/python/sc_bridge.cpp
@@ -295,11 +295,19 @@ extern "C" sc_graph_t *sc_build_graph(sc_pool_t *meta_pool,
     sc_graph_t *g = sc_graph_create();
     if (g == nullptr) return nullptr;
 
+    if (meta_pool == nullptr || pool_gpu == nullptr || pool_grad_cpu == nullptr || pool_grad_gpu == nullptr) {
+        sc_graph_destroy(g);
+        return nullptr;
+    }
+
     if (!verifyIfDAG(meta_pool, loss, g->nodes)) {
         sc_graph_destroy(g);
         return nullptr;
     }
 
+
+    setUpParentReference(g->nodes);
+
     assignBackendGraph(pool_gpu, g->nodes, to_backend_mode(backend_mode_val));
     assignGradMemory(pool_grad_cpu, pool_grad_gpu, g->nodes);
 
@@ -312,7 +320,7 @@ extern "C" void sc_graph_step(sc_pool_t *pool_cpu,
                               float learning_rate) {
     if (g == nullptr) return;
     tensor_graph_forward_evaluate(pool_cpu, pool_gpu, g->nodes);
-    autogradGpuMemTranfer(g->nodes);
+    //autogradGpuMemTranfer(g->nodes); Not needed anymore cause we good
     gradInitializer(g->nodes);
     tensor_graph_backward(g->nodes);
     tensor_sgd(g->nodes, learning_rate);
diff --git a/temp b/temp
new file mode 100644
index 0000000..1a218b9
--- /dev/null
+++ b/temp
@@ -0,0 +1,31 @@
+
+============== OUTPUT OF echo * ============================
+
+benchmarks build build_wsl CMakeLists.txt compile_commands.json docs include main.cpp make.sh model.bin notes README.md requirements.txt scripts src summary.txt tests
+
+
+============== OUTPUT OF ls -a ==============================
+.cache
+.git
+.github
+.venv
+benchmarks
+build
+build_wsl
+docs
+include
+notes
+scripts
+src
+tests
+.clang-format
+.gitignore
+CMakeLists.txt
+compile_commands.json -> build/compile_commands.json
+main.cpp
+make.sh
+model.bin
+README.md
+requirements.txt
+summary.txt
+temp