Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ check.txt
[Dd]esktop.ini

.venv/

build_wsl
# Recycle Bin used on file shares
$RECYCLE.BIN/

Expand Down
31 changes: 27 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ project(soft-cuda LANGUAGES CXX CUDA)
# Core library sources.
# Everything under src/ is globbed, then two things are filtered back out:
#   * the Python bridge subdirectory (built as its own target), and
#   * profiler.cu (built as the stand-alone soft_profiler executable).
# CONFIGURE_DEPENDS makes the build re-run the glob so newly added files are
# picked up without a manual re-configure.
file(GLOB_RECURSE LIB_SOURCE CONFIGURE_DEPENDS
  src/*.cu
  src/*.cpp
  src/*.hpp
  src/*.h
)
list(FILTER LIB_SOURCE EXCLUDE REGEX ".*/python/.*")
# The backslash must be doubled: inside a quoted CMake argument "\." is an
# identity escape that collapses to ".", which would leave the regex with an
# unescaped dot that matches any character.
list(FILTER LIB_SOURCE EXCLUDE REGEX ".*/profiler\\.cu$")

# Compile device code for the GPU(s) present on the build machine.
# NOTE: the "native" value requires CMake 3.24 or newer.
set(CMAKE_CUDA_ARCHITECTURES native)

add_library(soft_lib SHARED ${LIB_SOURCE})
target_link_libraries(soft_lib PRIVATE cublas)

set_target_properties(soft_lib PROPERTIES
CXX_STANDARD 17
Expand All @@ -19,6 +21,7 @@ target_include_directories(soft_lib
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/backend_cpu/include"
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include"
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src"
PRIVATE "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
)

target_compile_options(soft_lib PRIVATE
Expand All @@ -37,16 +40,16 @@ set_target_properties(soft_lib PROPERTIES
# target_compile_options(soft_lib PUBLIC
# $<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CXX>>:-fsanitize=address>
# )
# CUDA-only compile flags for soft_lib:
#   -g                         host-side debug information
#   --expt-relaxed-constexpr   lets device code call constexpr host functions
# The generator expression is quoted and its items are separated with ';'
# because an unquoted expression containing whitespace is split into several
# malformed arguments before it can be evaluated.
target_compile_options(soft_lib PRIVATE
  "$<$<COMPILE_LANGUAGE:CUDA>:-g;--expt-relaxed-constexpr>"
)
#
# target_link_options(soft_lib PUBLIC
# $<$<CONFIG:Debug>:-fsanitize=address>
# )


# ─────────────────────────────────────────────────────────────────────────────

# Python C bridge — flat-C shared library for use with ctypes / cffi
# ─────────────────────────────────────────────────────────────────────────────

add_library(soft_cuda_python SHARED src/python/sc_bridge.cpp)

set_target_properties(soft_cuda_python PROPERTIES
Expand All @@ -65,11 +68,31 @@ target_include_directories(soft_cuda_python
# The Python bridge wraps the core library ...
target_link_libraries(soft_cuda_python
  PRIVATE
    soft_lib
)

# ... and the demo executable is linked against the bridge.
add_executable(soft main.cpp)
target_link_libraries(soft
  PRIVATE
    soft_cuda_python
)

# Non-Windows toolchains: compile the bridge with default symbol visibility so
# that every sc_* symbol is exported from the shared library.
if(NOT WIN32)
  target_compile_options(soft_cuda_python
    PRIVATE
      -fvisibility=default
  )
endif()

# Unit tests live in their own subdirectory.
enable_testing()
add_subdirectory(tests)

# HARDWARE PROFILER
#
# Stand-alone executable built from src/init/config/profiler.cu and linked
# against soft_lib.

add_executable(soft_profiler src/init/config/profiler.cu)
set_target_properties(soft_profiler PROPERTIES
  CUDA_SEPARABLE_COMPILATION ON
  CXX_STANDARD 17
  CXX_STANDARD_REQUIRED ON
  # The only source of this target is a .cu file, and CXX_STANDARD does not
  # apply to CUDA sources; the CUDA_* pair is what actually selects C++17 here.
  CUDA_STANDARD 17
  CUDA_STANDARD_REQUIRED ON
)
target_include_directories(soft_profiler
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include"
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include"
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src"
)
target_link_libraries(soft_profiler PRIVATE soft_lib)

# Benchmarks are optional: only descend into the directory when it is present.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/CMakeLists.txt")
  add_subdirectory(benchmarks)
endif()


33 changes: 33 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# FindCUDAToolkit (which provides the CUDA::cublas / CUDA::cudart imported
# targets used below) was introduced in CMake 3.17.
cmake_minimum_required(VERSION 3.17)

# Locate the CUDA toolkit instead of hard-coding an installation path, so the
# benchmarks build regardless of where CUDA is installed.
find_package(CUDAToolkit REQUIRED)

# ── bench_softcuda ──────────────────────────────────────────────────────────
# C++ benchmark against the library; it also links cuBLAS and the CUDA runtime
# directly.
add_executable(bench_softcuda bench_softcuda.cpp)
set_target_properties(bench_softcuda PROPERTIES
  CXX_STANDARD 17
  CXX_STANDARD_REQUIRED ON
)
target_include_directories(bench_softcuda
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../include"
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src/core/include"
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src"
)
# The imported CUDA:: targets carry the toolkit include directory and library
# location, so no explicit toolkit include path or link directory is needed.
target_link_libraries(bench_softcuda
  PRIVATE
    soft_cuda_python
    CUDA::cublas
    CUDA::cudart
)

# ── bench_deep_mlp ──────────────────────────────────────────────────────────
# 4-layer MLP benchmark; it only goes through the soft_cuda_python bridge.
add_executable(bench_deep_mlp bench_deep_mlp.cpp)
set_target_properties(bench_deep_mlp PROPERTIES
  CXX_STANDARD 17
  CXX_STANDARD_REQUIRED ON
)
target_include_directories(bench_deep_mlp
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../include"
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src/core/include"
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../src"
)
target_link_libraries(bench_deep_mlp PRIVATE soft_cuda_python)
137 changes: 137 additions & 0 deletions benchmarks/bench_deep_mlp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/**
* bench_deep_mlp.cpp
*
* Implements a 4-layer Deep MLP to stress-test the Hybrid Dispatcher.
* Network: Input (784) -> Hidden1 (512) -> Hidden2 (256) -> Hidden3 (128) -> Output (10)
*
* This benchmark demonstrates:
* 1. Correct routing of small vs large layers.
* 2. Persistence of GPU memory across all training iterations (100 epochs by default).
* 3. Hybrid execution benefit (dispatching compute-heavy layers to GPU,
* and memory-bound/small layers to CPU).
*/

#include "soft-cuda/tensor/api.h"
#include "soft-cuda/python/soft_cuda_python.h"

#include <chrono>
#include <cstdio>
#include <vector>
#include <cmath>
#include <cassert>

/**
 * Returns a timestamp in milliseconds, as a double.
 *
 * The value is read from std::chrono::steady_clock, which is guaranteed to be
 * monotonic, so the difference between two calls is a valid elapsed time even
 * if the system wall clock is adjusted in between. The absolute value has no
 * meaning on its own; only differences should be used.
 */
static double now_ms() {
    using clock = std::chrono::steady_clock;
    // A floating-point duration in milliseconds converts implicitly from the
    // clock's native tick type without truncation.
    const std::chrono::duration<double, std::milli> since_epoch =
        clock::now().time_since_epoch();
    return since_epoch.count();
}

/** One fully-connected layer: a weight tensor and a bias tensor. */
struct Layer {
    sc_tensor_t* W;  // weights, created with shape {in_dim, out_dim}
    sc_tensor_t* b;  // bias, created with shape {1, out_dim}
};

/**
 * Allocates and randomly initialises the parameters of one dense layer.
 *
 * @param pool    pool the two tensors are created from
 * @param in_dim  number of input features
 * @param out_dim number of output features
 * @return a Layer whose W has shape {in_dim, out_dim} and whose b has shape
 *         {1, out_dim}
 *
 * Weights are drawn from N(0, sqrt(2 / in_dim)) (He initialisation) and the
 * bias from N(0, 0.01).
 */
static Layer create_layer(sc_pool_t *pool, uint32_t in_dim, uint32_t out_dim) {
    uint32_t weight_shape[2] = {in_dim, out_dim};
    uint32_t bias_shape[2] = {1, out_dim};
    const float he_stddev = sqrtf(2.0f / (float)in_dim);

    Layer layer;
    // NOTE(review): the trailing 1 is presumably a "requires grad" flag (the
    // input/target tensors pass 0) -- confirm against sc_tensor_create.
    layer.W = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, weight_shape, NULL, 1);
    layer.b = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, bias_shape, NULL, 1);

    sc_tensor_fill_random_normal(layer.W, 0.0f, he_stddev);
    sc_tensor_fill_random_normal(layer.b, 0.0f, 0.01f);
    return layer;
}

/**
 * Appends one dense layer to the computation: X * W + b, optionally followed
 * by ReLU.
 *
 * @param pool     pool the intermediate tensors are created from
 * @param X        layer input
 * @param L        layer parameters (W and b)
 * @param use_relu when true (the default) the result is passed through
 *                 sc_tensor_relu; when false the pre-activation is returned
 * @return the layer's output tensor
 */
static sc_tensor_t* forward_layer(sc_pool_t *pool, sc_tensor_t *X, Layer &L, bool use_relu = true) {
    sc_tensor_t *pre_activation =
        sc_tensor_add(pool, sc_tensor_mul_naive(pool, X, L.W), L.b);
    return use_relu ? sc_tensor_relu(pool, pre_activation) : pre_activation;
}

void run_mlp_bench(int backend_mode, const char *label, int epochs = 100) {
printf("--- Benchmarking %s ---\n", label);

const uint32_t BATCH = 64;
const uint32_t D_IN = 784, D1 = 512, D2 = 256, D3 = 128, D_OUT = 10;

sc_pool_t *pool = sc_pool_create(128 * 1024 * 1024, 0); // 128MB CPU
sc_pool_t *meta = sc_pool_create(8 * 1024 * 1024, 0); // 8MB Meta
sc_pool_t *gpc = sc_pool_create(32 * 1024 * 1024, 0); // 32MB Grad CPU

bool use_gpu = (backend_mode != SC_BACKEND_CPU);
sc_pool_t *gpg = sc_pool_create(use_gpu ? 32 * 1024 * 1024 : 1024, use_gpu ? 1 : 0);
sc_pool_t *pgpu = sc_pool_create(use_gpu ? 128 * 1024 * 1024 : 1024, use_gpu ? 1 : 0);

if (!pool || !meta || !gpc || !gpg || !pgpu) {
printf("FAILED to allocate pools (backend_mode=%d)\n", backend_mode);
if (pool) sc_pool_destroy(pool);
if (meta) sc_pool_destroy(meta);
if (gpc) sc_pool_destroy(gpc);
if (gpg) sc_pool_destroy(gpg);
if (pgpu) sc_pool_destroy(pgpu);
return;
}

uint32_t dX[] = {BATCH, D_IN};
uint32_t dY[] = {BATCH, D_OUT};
sc_tensor_t *X = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dX, NULL, 0);
sc_tensor_t *Y = sc_tensor_create(pool, SC_DTYPE_FLOAT32, 2, dY, NULL, 0);
sc_tensor_fill_random_normal(X, 0.5f, 0.2f);
sc_tensor_fill_random_normal(Y, 0.1f, 0.05f);

Layer L1 = create_layer(pool, D_IN, D1);
Layer L2 = create_layer(pool, D1, D2);
Layer L3 = create_layer(pool, D2, D3);
Layer L4 = create_layer(pool, D3, D_OUT);

// Build Graph
sc_tensor_t *H1 = forward_layer(pool, X, L1);
sc_tensor_t *H2 = forward_layer(pool, H1, L2);
sc_tensor_t *H3 = forward_layer(pool, H2, L3);
sc_tensor_t *Yp = forward_layer(pool, H3, L4, false);

sc_tensor_t *diff = sc_tensor_sub(pool, Yp, Y);
sc_tensor_t *sq = sc_tensor_square(pool, diff);
sc_tensor_t *loss = sc_tensor_mean(pool, sq);

sc_graph_t *g = sc_build_graph(meta, pgpu, gpc, gpg, loss, backend_mode);
if (!g) {
printf("FAILED to build graph\n");
return;
}

// Warmup
sc_graph_step(pool, pgpu, g, 0.01f);

double t0 = now_ms();
for (int i = 0; i < epochs; i++) {
sc_graph_step(pool, pgpu, g, 0.01f);
if ((i+1) % (epochs/5) == 0) {
printf(" Epoch %d/%d | Loss: %.6f\n", i+1, epochs, sc_graph_get_loss(g));
}
}
double elapsed = now_ms() - t0;

printf(" [RESULT] Total: %.2f ms | Avg: %.2f ms/step\n\n", elapsed, elapsed / epochs);

sc_graph_destroy(g);
sc_pool_destroy(pool); sc_pool_destroy(meta);
sc_pool_destroy(gpc); sc_pool_destroy(gpg); sc_pool_destroy(pgpu);
}

/**
 * Entry point: runs the MLP benchmark once per backend, in the order
 * CPU-only, GPU-only, hybrid, each with the default epoch count.
 */
int main() {
    printf("\n--- Deep MLP Benchmark (4-Layer, Hybrid Dispatch) ---\n\n");

    struct BenchRun {
        int backend_mode;
        const char *label;
    };

    const BenchRun runs[] = {
        // CPU reference
        {SC_BACKEND_CPU, "CPU-Only Backend (Baseline)"},
        // GPU reference
        {SC_BACKEND_GPU, "GPU-Only Backend (Full Acceleration)"},
        // Hybrid dispatch (AOT-profiled); this will use thresholds from CONFIG.soft
        {SC_BACKEND_HYBRID, "HYBRID Backend (AOT-Optimized)"},
    };

    for (const BenchRun &run : runs) {
        run_mlp_bench(run.backend_mode, run.label);
    }

    return 0;
}
Loading
Loading