Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions RayTracing/src/CUDARenderer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ extern "C"
struct CUDARenderState
{
float4* d_SampleBuffer; // Device per-pixel sample buffer (raw path trace output)
float4* d_DenoiseBuffer; // Device per-pixel denoise buffer (averaged HDR, for OptiX)
float4* d_AccumulationBuffer; // Device accumulation buffer
uint32_t* d_OutputImage; // Device output RGBA buffer

Expand Down Expand Up @@ -46,6 +47,7 @@ CUDARenderState* CUDARenderer_Create()
state->initialized = false;

state->d_SampleBuffer = nullptr;
state->d_DenoiseBuffer = nullptr;
state->d_AccumulationBuffer = nullptr;
state->d_OutputImage = nullptr;
state->d_Spheres = nullptr;
Expand All @@ -67,6 +69,7 @@ void CUDARenderer_Destroy(CUDARenderState const* state)
if (!state) return;

if (state->d_SampleBuffer) cudaFree(state->d_SampleBuffer);
if (state->d_DenoiseBuffer) cudaFree(state->d_DenoiseBuffer);
if (state->d_AccumulationBuffer) cudaFree(state->d_AccumulationBuffer);
if (state->d_OutputImage) cudaFree(state->d_OutputImage);
if (state->d_Spheres) cudaFree(state->d_Spheres);
Expand Down Expand Up @@ -129,12 +132,14 @@ void CUDARenderer_OnResize(CUDARenderState* state, uint32_t width, uint32_t heig

// Free old buffers
if (state->d_SampleBuffer) cudaFree(state->d_SampleBuffer);
if (state->d_DenoiseBuffer) cudaFree(state->d_DenoiseBuffer);
if (state->d_AccumulationBuffer) cudaFree(state->d_AccumulationBuffer);
if (state->d_OutputImage) cudaFree(state->d_OutputImage);
if (state->d_RayDirections) cudaFree(state->d_RayDirections);

// Allocate new buffers
cudaMalloc(&state->d_SampleBuffer, state->pixelCount * sizeof(float4));
cudaMalloc(&state->d_DenoiseBuffer, state->pixelCount * sizeof(float4));
cudaMalloc(&state->d_AccumulationBuffer, state->pixelCount * sizeof(float4));
cudaMalloc(&state->d_OutputImage, state->pixelCount * sizeof(uint32_t));
cudaMalloc(&state->d_RayDirections, state->pixelCount * sizeof(float3));
Expand Down Expand Up @@ -272,6 +277,7 @@ void CUDARenderer_Render(
PostProcessKernel<<<ppBlocks, ppThreads, 0, state->computeStream>>>(
state->d_SampleBuffer,
state->d_AccumulationBuffer,
state->d_DenoiseBuffer,
state->d_OutputImage,
frameIndex,
state->pixelCount
Expand Down Expand Up @@ -299,6 +305,31 @@ void CUDARenderer_GetOutput(
byteSize, cudaMemcpyDeviceToHost);
}

void* CUDARenderer_GetDenoiseBuffer(CUDARenderState* state)
{
if (!state || !state->initialized) return nullptr;
return state->d_DenoiseBuffer;
}

void CUDARenderer_ConvertDenoisedToRGBA(CUDARenderState* state, cudaStream_t stream)
{
if (!state || !state->initialized || state->pixelCount == 0) return;

int threads = 256;
int blocks = (state->pixelCount + threads - 1) / threads;
ConvertToRGBAKernel<<<blocks, threads, 0, stream>>>(
(const float4*)state->d_DenoiseBuffer,
state->d_OutputImage,
state->pixelCount
);
}
Comment on lines +314 to +325

void* CUDARenderer_GetComputeStream(CUDARenderState* state)
{
if (!state) return nullptr;
return state->computeStream;
}

void CUDARenderer_SetSettings(
CUDARenderState* state,
int maxBounces)
Expand Down
33 changes: 32 additions & 1 deletion RayTracing/src/CUDARenderer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ __launch_bounds__(256, 4)
__global__ void PostProcessKernel(
const float4* __restrict__ sampleBuffer,
float4* __restrict__ accumulationBuffer,
float4* __restrict__ denoiseBuffer,
uint32_t* __restrict__ outputImage,
uint32_t frameIndex,
uint32_t pixelCount)
Expand All @@ -291,7 +292,7 @@ __global__ void PostProcessKernel(
accumulated.x += sample.x;
accumulated.y += sample.y;
accumulated.z += sample.z;
accumulated.w += 1.0f; // sample count
accumulated.w += 1.0f;
accumulationBuffer[idx] = accumulated;

// Average
Expand All @@ -303,6 +304,9 @@ __global__ void PostProcessKernel(
accumulated.w * invFrames
);

// Write averaged HDR for denoising (before clamp/RGBA conversion)
denoiseBuffer[idx] = make_float4(averaged.x, averaged.y, averaged.z, 0.0f);

// Clamp to [0, 1]
averaged.x = fminf(fmaxf(averaged.x, 0.0f), 1.0f);
averaged.y = fminf(fmaxf(averaged.y, 0.0f), 1.0f);
Expand All @@ -317,6 +321,33 @@ __global__ void PostProcessKernel(
outputImage[idx] = (a << 24) | (b << 16) | (g << 8) | r;
}

// ──────────────────────────────────────────────
// Convert denoised float4 → RGBA8 (used after OptiX denoise pass)
// ──────────────────────────────────────────────

__launch_bounds__(256, 4)
__global__ void ConvertToRGBAKernel(
const float4* __restrict__ inputBuffer,
uint32_t* __restrict__ outputImage,
uint32_t pixelCount)
{
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= pixelCount)
return;

float4 color = inputBuffer[idx];
color.x = fminf(fmaxf(color.x, 0.0f), 1.0f);
color.y = fminf(fmaxf(color.y, 0.0f), 1.0f);
color.z = fminf(fmaxf(color.z, 0.0f), 1.0f);

uint8_t r = static_cast<uint8_t>(color.x * 255.0f);
uint8_t g = static_cast<uint8_t>(color.y * 255.0f);
uint8_t b = static_cast<uint8_t>(color.z * 255.0f);
uint8_t a = 255;

outputImage[idx] = (a << 24) | (b << 16) | (g << 8) | r;
}

// ──────────────────────────────────────────────
// Clear accumulation buffer to zero
// ──────────────────────────────────────────────
Expand Down
12 changes: 12 additions & 0 deletions RayTracing/src/CUDARenderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ void CUDARenderer_GetOutput(
CUDARenderState* state,
void* hostOutput, uint32_t byteSize);

// Get denoise buffer pointer (float4* HDR, for OptiX denoiser)
void* CUDARenderer_GetDenoiseBuffer(CUDARenderState* state);

// Convert denoised float4 buffer → RGBA8 output (GPU-side)
void CUDARenderer_ConvertDenoisedToRGBA(CUDARenderState* state, void* stream);

// Get compute stream for OptiX denoiser
void* CUDARenderer_GetComputeStream(CUDARenderState* state);

// Settings
void CUDARenderer_SetSettings(
CUDARenderState* state,
Expand All @@ -69,10 +78,13 @@ void CUDARenderer_DebugFill(CUDARenderState* state);
#include <cstdint>

// C++-compatible float3 (matches CUDA float3 layout: 12 bytes, alignment 4)
// Only define when CUDA's float3 from cuda_runtime.h is not already available
#if !defined(CUDART_VERSION)
struct float3
{
float x, y, z;
};
#endif

// Memory layout must match GPUSphere in CUDATypes.cuh
// Compile-time enforcement via static_assert in CUDATypes.cuh
Expand Down
181 changes: 181 additions & 0 deletions RayTracing/src/OptiXDenoiser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#ifdef WL_OPTIX

#include "OptiXDenoiser.h"

// Define the global OptiX function table (required by optix_stubs.h)
// Must be defined in exactly one translation unit.
OptixFunctionTable g_optixFunctionTable_118 = {};

#include <cstdio>

// ── Error-checking helper ──

#define OPTIX_CHECK(call) \
do { \
OptixResult res = (call); \
if (res != OPTIX_SUCCESS) { \
std::fprintf(stderr, \
"[OptiX] Error at %s:%d - %s (code: %d)\n", \
__FILE__, __LINE__, optixGetErrorString(res), \
static_cast<int>(res)); \
} \
} while (0)

// ── RAII Lifecycle ──

OptiXDenoiser::OptiXDenoiser()
{
OptixResult result = optixInit();
if (result != OPTIX_SUCCESS)
{
std::fprintf(stderr, "[OptiX] optixInit failed: %s\n",
optixGetErrorString(result));
return;
}

// Retrieve current CUDA context (must already exist)
CUcontext cuCtx = nullptr;
CUresult cuRes = cuCtxGetCurrent(&cuCtx);
if (cuRes != CUDA_SUCCESS || cuCtx == nullptr)
{
CUdevice cuDevice;
cuCtxGetDevice(&cuDevice);
cuRes = cuDevicePrimaryCtxRetain(&cuCtx, cuDevice);
if (cuRes != CUDA_SUCCESS)
{
std::fprintf(stderr, "[OptiX] Failed to get CUDA context\n");
return;
}
}
Comment on lines +36 to +49

OptixDeviceContextOptions ctxOpts = {};
result = optixDeviceContextCreate(cuCtx, &ctxOpts, &m_optixContext);
if (result != OPTIX_SUCCESS)
{
std::fprintf(stderr, "[OptiX] optixDeviceContextCreate failed: %s\n",
optixGetErrorString(result));
return;
}

std::printf("[OptiX] Context created successfully\n");
}

OptiXDenoiser::~OptiXDenoiser()
{
Cleanup();
if (m_optixContext)
optixDeviceContextDestroy(m_optixContext);
}

// ── Initialize / Resize ──

bool OptiXDenoiser::Initialize(uint32_t width, uint32_t height, cudaStream_t stream)
{
if (!m_optixContext || width == 0 || height == 0)
return false;

Cleanup(); // Free previous denoiser resources

m_width = width;
m_height = height;

OptixDenoiserOptions opts = {};
opts.guideAlbedo = 0;
opts.guideNormal = 0;
opts.denoiseAlpha = OPTIX_DENOISER_ALPHA_MODE_COPY;

OPTIX_CHECK(optixDenoiserCreate(
m_optixContext, OPTIX_DENOISER_MODEL_KIND_HDR,
&opts, &m_denoiser));

if (!m_denoiser) return false;

OPTIX_CHECK(optixDenoiserComputeMemoryResources(
m_denoiser, width, height, &m_sizes));

cudaMalloc(&m_dStateBuffer, m_sizes.stateSizeInBytes);
cudaMalloc(&m_dScratchBuffer, m_sizes.withoutOverlapScratchSizeInBytes);
cudaMalloc(&m_dHdrIntensity, sizeof(float));

OPTIX_CHECK(optixDenoiserSetup(
m_denoiser, (CUstream)stream,
width, height,
(CUdeviceptr)m_dStateBuffer, m_sizes.stateSizeInBytes,
(CUdeviceptr)m_dScratchBuffer, m_sizes.withoutOverlapScratchSizeInBytes));

m_valid = true;
Comment on lines +87 to +106

std::printf("[OptiX] Denoiser ready (%ux%u, %.1f MB scratch)\n",
width, height,
static_cast<float>(m_sizes.withoutOverlapScratchSizeInBytes
+ m_sizes.stateSizeInBytes
+ sizeof(float)) / (1024.0f * 1024.0f));
return true;
}

void OptiXDenoiser::Resize(uint32_t width, uint32_t height, cudaStream_t stream)
{
if (width == m_width && height == m_height && m_valid)
return;
Initialize(width, height, stream);
}

// ── Per-Frame Denoise ──

void OptiXDenoiser::Denoise(float4* d_input, float4* d_output,
uint32_t width, uint32_t height,
cudaStream_t stream)
{
if (!m_valid || !m_denoiser || width != m_width || height != m_height)
return;

OptixImage2D inputLayer = {};
inputLayer.data = (CUdeviceptr)d_input;
inputLayer.width = width;
inputLayer.height = height;
inputLayer.rowStrideInBytes = width * sizeof(float4);
inputLayer.pixelStrideInBytes = sizeof(float4);
inputLayer.format = OPTIX_PIXEL_FORMAT_FLOAT4;

OptixImage2D outputLayer = inputLayer;
outputLayer.data = (CUdeviceptr)d_output;

// Compute HDR intensity for brightness normalization
OPTIX_CHECK(optixDenoiserComputeIntensity(
m_denoiser, (CUstream)stream,
&inputLayer,
(CUdeviceptr)m_dHdrIntensity,
(CUdeviceptr)m_dScratchBuffer,
m_sizes.withoutOverlapScratchSizeInBytes));

OptixDenoiserParams params = {};
params.blendFactor = 0.0f;
params.hdrIntensity = (CUdeviceptr)m_dHdrIntensity;

OptixDenoiserGuideLayer guideLayer = {};
OptixDenoiserLayer layer = {};
layer.input = inputLayer;
layer.output = outputLayer;

OPTIX_CHECK(optixDenoiserInvoke(
m_denoiser, (CUstream)stream,
&params,
(CUdeviceptr)m_dStateBuffer, m_sizes.stateSizeInBytes,
&guideLayer, &layer, 1,
0, 0,
(CUdeviceptr)m_dScratchBuffer,
m_sizes.withoutOverlapScratchSizeInBytes));
}

// ── Cleanup ──

void OptiXDenoiser::Cleanup()
{
if (m_dHdrIntensity) { cudaFree(m_dHdrIntensity); m_dHdrIntensity = nullptr; }
if (m_dScratchBuffer) { cudaFree(m_dScratchBuffer); m_dScratchBuffer = nullptr; }
if (m_dStateBuffer) { cudaFree(m_dStateBuffer); m_dStateBuffer = nullptr; }
if (m_denoiser) { optixDenoiserDestroy(m_denoiser); m_denoiser = nullptr; }
m_valid = false;
}

#endif // WL_OPTIX
Loading
Loading