Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "Walnut"]
path = Walnut
url = https://github.com/TheCherno/Walnut
url = https://github.com/Cle2ment/Walnut.git
22 changes: 19 additions & 3 deletions RayTracing/src/CUDARenderer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ struct CUDARenderState
float4* d_SampleBuffer; // Device per-pixel sample buffer (raw path trace output)
float4* d_DenoiseBuffer; // Device per-pixel denoise buffer (averaged HDR, for OptiX)
float4* d_AccumulationBuffer; // Device accumulation buffer
uint32_t* d_OutputImage; // Device output RGBA buffer
uint32_t* d_OutputImage; // Device output RGBA buffer (legacy, null if interop)
void* d_InteropBuffer; // External device buffer (Vulkan-CUDA interop, nullptr if disabled)

GPUSphere* d_Spheres; // Device scene spheres
GPUMaterial* d_Materials; // Device scene materials
Expand Down Expand Up @@ -50,6 +51,7 @@ CUDARenderState* CUDARenderer_Create()
state->d_DenoiseBuffer = nullptr;
state->d_AccumulationBuffer = nullptr;
state->d_OutputImage = nullptr;
state->d_InteropBuffer = nullptr;
state->d_Spheres = nullptr;
state->d_Materials = nullptr;
state->d_RayDirections = nullptr;
Expand Down Expand Up @@ -278,7 +280,7 @@ void CUDARenderer_Render(
state->d_SampleBuffer,
state->d_AccumulationBuffer,
state->d_DenoiseBuffer,
state->d_OutputImage,
state->d_InteropBuffer ? (uint32_t*)state->d_InteropBuffer : state->d_OutputImage,
frameIndex,
state->pixelCount
);
Expand Down Expand Up @@ -349,14 +351,28 @@ void CUDARenderer_DebugFill(CUDARenderState* state)
);

DebugFillKernel<<<gridDim, blockDim>>>(
state->d_OutputImage,
state->d_InteropBuffer ? (uint32_t*)state->d_InteropBuffer : state->d_OutputImage,
state->imageWidth,
state->imageHeight
);

cudaDeviceSynchronize();
}

// ─── Vulkan-CUDA Interop API ───

void CUDARenderer_SetOutputBuffer(CUDARenderState* state, void* devPtr)
{
if (!state) return;
state->d_InteropBuffer = devPtr;
}

void CUDARenderer_Sync(CUDARenderState* state)
{
if (!state || !state->initialized) return;
cudaStreamSynchronize(state->computeStream);
}

void CUDARenderer_CheckError(const char* file, int line)
{
cudaError_t err = cudaGetLastError();
Expand Down
8 changes: 7 additions & 1 deletion RayTracing/src/CUDARenderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,17 @@ void CUDARenderer_SetCameraPosition(
// Render one frame (launches CUDA kernel)
void CUDARenderer_Render(CUDARenderState* state, uint32_t frameIndex);

// Download output image (device → host)
// Download output image (device → host) — legacy, skipped when interop active
void CUDARenderer_GetOutput(
CUDARenderState* state,
void* hostOutput, uint32_t byteSize);

// Vulkan-CUDA interop: set externally-allocated output buffer (null = use d_OutputImage)
void CUDARenderer_SetOutputBuffer(CUDARenderState* state, void* devPtr);

// Synchronize CUDA compute stream (replaces blocking GetOutput when interop active)
void CUDARenderer_Sync(CUDARenderState* state);

// Get denoise buffer pointer (float4* HDR, for OptiX denoiser)
void* CUDARenderer_GetDenoiseBuffer(CUDARenderState* state);

Expand Down
118 changes: 111 additions & 7 deletions RayTracing/src/Renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

#include "Walnut/Random.h"

#ifdef WL_CUDA
#include "Walnut/Application.h"
#endif

#include <cstring>
#include <execution>
#include <limits>
Expand Down Expand Up @@ -181,6 +185,15 @@ void Renderer::OnResize(uint32_t width, uint32_t height)
if (cudaInitialized)
{
CUDARenderer_OnResize(m_CUDAState, width, height);

// Recreate interop buffer on resize when enabled
if (m_InteropEnabled)
{
try { m_Interop = std::make_unique<VkCUDAInterop>(width, height); }
catch (...) { m_Interop.reset(); m_InteropEnabled = false; }
if (m_Interop)
CUDARenderer_SetOutputBuffer(m_CUDAState, m_Interop->GetCUDADevicePtr());
}
}
#endif

Expand Down Expand Up @@ -349,7 +362,66 @@ void Renderer::Render(const Scene& scene, const Camera& camera)
#endif // WL_ISPC
#endif // !WL_CUDA

m_FinalImage->SetData(m_ImageData);
#ifdef WL_CUDA
if (m_InteropEnabled && m_Interop)
{
// Interop path: CUDA wrote to Vulkan buffer, copy to Walnut's VkImage
VkCommandBuffer cmd = Walnut::Application::GetCommandBuffer(true);
VkImage dstImage = m_FinalImage->GetImage();

// Buffer barrier: external (CUDA) write → Vulkan transfer read
VkBufferMemoryBarrier bufBarrier = {};
bufBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
bufBarrier.srcAccessMask = 0;
bufBarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
bufBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
bufBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
bufBarrier.buffer = m_Interop->GetVulkanBuffer();
bufBarrier.size = VK_WHOLE_SIZE;
vkCmdPipelineBarrier(cmd,
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0, 0, nullptr, 1, &bufBarrier, 0, nullptr);

VkImageMemoryBarrier preBarrier = {};
preBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
preBarrier.oldLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
preBarrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
preBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
preBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
preBarrier.image = dstImage;
preBarrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 };
preBarrier.srcAccessMask = 0;
preBarrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, nullptr, 1, &preBarrier);

VkBufferImageCopy region = {};
region.imageSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 };
region.imageExtent = { m_Interop->GetWidth(), m_Interop->GetHeight(), 1 };
vkCmdCopyBufferToImage(cmd, m_Interop->GetVulkanBuffer(), dstImage,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);

VkImageMemoryBarrier postBarrier = {};
postBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
postBarrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
postBarrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
postBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
postBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
postBarrier.image = dstImage;
postBarrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 };
postBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
postBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &postBarrier);

Walnut::Application::FlushCommandBuffer(cmd);
}
else
#endif
{
m_FinalImage->SetData(m_ImageData);
}

if (m_Settings.Accumulate)
m_FrameIndex++;
Expand Down Expand Up @@ -573,6 +645,31 @@ void Renderer::RenderGPU(const Scene& scene, const Camera& camera)
const uint32_t height = m_FinalImage->GetHeight();
if (width == 0 || height == 0) return;

// Sync interop toggle from settings (requires re-creation on enable)
if (m_Settings.EnableInterop != m_InteropEnabled)
{
m_InteropEnabled = m_Settings.EnableInterop;
if (m_InteropEnabled)
{
try
{
m_Interop = std::make_unique<VkCUDAInterop>(width, height);
CUDARenderer_SetOutputBuffer(m_CUDAState, m_Interop->GetCUDADevicePtr());
}
catch (...)
{
m_Interop.reset();
m_InteropEnabled = false;
m_Settings.EnableInterop = false;
}
}
else
{
m_Interop.reset();
CUDARenderer_SetOutputBuffer(m_CUDAState, nullptr);
}
}

// Upload scene data only when changed (tracked by scene version)
if (scene.Version != m_LastSceneVersion)
{
Expand Down Expand Up @@ -631,12 +728,19 @@ void Renderer::RenderGPU(const Scene& scene, const Camera& camera)
}
#endif

// Download output image from GPU
CUDARenderer_GetOutput(
m_CUDAState,
m_ImageData,
width * height * sizeof(uint32_t)
);
// Download output image from GPU (skip D2H when interop writes directly to Vulkan)
if (m_InteropEnabled && m_Interop)
{
m_Interop->SyncCUDAComplete((cudaStream_t)CUDARenderer_GetComputeStream(m_CUDAState));
}
else
{
CUDARenderer_GetOutput(
m_CUDAState,
m_ImageData,
width * height * sizeof(uint32_t)
);
}
}

#endif // WL_CUDA
6 changes: 6 additions & 0 deletions RayTracing/src/Renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#endif

#ifdef WL_CUDA
#include "VkCUDAInterop.h" // Must precede CUDARenderer.h — defines CUDART_VERSION so float3 guard works
#include "CUDARenderer.h"
#endif

Expand All @@ -31,6 +32,9 @@ class Renderer
bool SlowRandom = true;
#ifdef WL_OPTIX
bool EnableDenoising = true;
#endif
#ifdef WL_CUDA
bool EnableInterop = false;
#endif
};

Expand Down Expand Up @@ -97,6 +101,8 @@ class Renderer

#ifdef WL_CUDA
CUDARenderState* m_CUDAState = nullptr;
std::unique_ptr<VkCUDAInterop> m_Interop;
bool m_InteropEnabled = false;
std::vector<GPUPackedSphere> m_GPUSpheres;
std::vector<GPUPackedMaterial> m_GPUMaterials;
std::vector<float3> m_GPURayDirs;
Expand Down
115 changes: 115 additions & 0 deletions RayTracing/src/VkCUDAInterop.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#include "VkCUDAInterop.h"
#include "Walnut/Application.h"
#include <stdexcept>
#include <cstring>

static uint32_t FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties)
{
VkPhysicalDeviceMemoryProperties memProps;
vkGetPhysicalDeviceMemoryProperties(Walnut::Application::GetPhysicalDevice(), &memProps);
for (uint32_t i = 0; i < memProps.memoryTypeCount; i++)
if ((typeFilter & (1 << i)) &&
(memProps.memoryTypes[i].propertyFlags & properties) == properties)
return i;
throw std::runtime_error("VkCUDAInterop: no suitable memory type");
}

VkCUDAInterop::VkCUDAInterop(uint32_t width, uint32_t height)
: m_Width(width), m_Height(height), m_Size(width * height * sizeof(uint32_t))
{
CreateVulkanBuffer();
ExportToCUDA();
}

VkCUDAInterop::~VkCUDAInterop()
{
VkDevice device = Walnut::Application::GetDevice();
if (m_CUDAExtMem) cudaDestroyExternalMemory(m_CUDAExtMem);
if (m_Memory) vkFreeMemory(device, m_Memory, nullptr);
if (m_Buffer) vkDestroyBuffer(device, m_Buffer, nullptr);
}

void VkCUDAInterop::CreateVulkanBuffer()
{
VkDevice device = Walnut::Application::GetDevice();

VkExternalMemoryBufferCreateInfo extInfo = {};
extInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO;
extInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;

VkBufferCreateInfo bufferInfo = {};
bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
bufferInfo.pNext = &extInfo;
bufferInfo.size = m_Size;
bufferInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

if (vkCreateBuffer(device, &bufferInfo, nullptr, &m_Buffer) != VK_SUCCESS)
throw std::runtime_error("VkCUDAInterop: vkCreateBuffer failed");

VkMemoryRequirements memReq;
vkGetBufferMemoryRequirements(device, m_Buffer, &memReq);

VkMemoryDedicatedAllocateInfo dedicatedInfo = {};
dedicatedInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO;
dedicatedInfo.buffer = m_Buffer;

VkExportMemoryAllocateInfo exportInfo = {};
exportInfo.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO;
exportInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
exportInfo.pNext = nullptr;
dedicatedInfo.pNext = &exportInfo;

VkMemoryAllocateInfo allocInfo = {};
allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocInfo.pNext = &dedicatedInfo;
allocInfo.allocationSize = memReq.size;
allocInfo.memoryTypeIndex = FindMemoryType(memReq.memoryTypeBits,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

if (vkAllocateMemory(device, &allocInfo, nullptr, &m_Memory) != VK_SUCCESS)
throw std::runtime_error("VkCUDAInterop: vkAllocateMemory failed");

vkBindBufferMemory(device, m_Buffer, m_Memory, 0);
}

void VkCUDAInterop::ExportToCUDA()
{
VkDevice device = Walnut::Application::GetDevice();

auto vkGetMemoryWin32HandleKHR = (PFN_vkGetMemoryWin32HandleKHR)
vkGetDeviceProcAddr(device, "vkGetMemoryWin32HandleKHR");
if (!vkGetMemoryWin32HandleKHR)
throw std::runtime_error("VkCUDAInterop: vkGetMemoryWin32HandleKHR not available");

VkMemoryGetWin32HandleInfoKHR handleInfo = {};
handleInfo.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR;
handleInfo.memory = m_Memory;
handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;

HANDLE handle = nullptr;
if (vkGetMemoryWin32HandleKHR(device, &handleInfo, &handle) != VK_SUCCESS)
throw std::runtime_error("VkCUDAInterop: vkGetMemoryWin32HandleKHR failed");

cudaExternalMemoryHandleDesc extMemDesc = {};
extMemDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
extMemDesc.handle.win32.handle = handle;
extMemDesc.size = m_Size;
extMemDesc.flags = cudaExternalMemoryDedicated;

cudaError_t err = cudaImportExternalMemory(&m_CUDAExtMem, &extMemDesc);
CloseHandle(handle);
if (err != cudaSuccess)
throw std::runtime_error(std::string("VkCUDAInterop: cudaImportExternalMemory failed: ") + cudaGetErrorString(err));

cudaExternalMemoryBufferDesc bufDesc = {};
bufDesc.size = m_Size;

err = cudaExternalMemoryGetMappedBuffer(&m_CUDADevPtr, m_CUDAExtMem, &bufDesc);
if (err != cudaSuccess)
throw std::runtime_error(std::string("VkCUDAInterop: cudaExternalMemoryGetMappedBuffer failed: ") + cudaGetErrorString(err));
}

void VkCUDAInterop::SyncCUDAComplete(cudaStream_t stream)
{
cudaStreamSynchronize(stream);
}
Loading
Loading