diff --git a/.gitmodules b/.gitmodules index 3bfad55..aaa91ae 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "Walnut"] path = Walnut - url = https://github.com/TheCherno/Walnut + url = https://github.com/Cle2ment/Walnut.git diff --git a/RayTracing/src/CUDARenderer.cu b/RayTracing/src/CUDARenderer.cu index 9a656cf..72e2160 100644 --- a/RayTracing/src/CUDARenderer.cu +++ b/RayTracing/src/CUDARenderer.cu @@ -17,7 +17,8 @@ struct CUDARenderState float4* d_SampleBuffer; // Device per-pixel sample buffer (raw path trace output) float4* d_DenoiseBuffer; // Device per-pixel denoise buffer (averaged HDR, for OptiX) float4* d_AccumulationBuffer; // Device accumulation buffer - uint32_t* d_OutputImage; // Device output RGBA buffer + uint32_t* d_OutputImage; // Device output RGBA buffer (legacy, null if interop) + void* d_InteropBuffer; // External device buffer (Vulkan-CUDA interop, nullptr if disabled) GPUSphere* d_Spheres; // Device scene spheres GPUMaterial* d_Materials; // Device scene materials @@ -50,6 +51,7 @@ CUDARenderState* CUDARenderer_Create() state->d_DenoiseBuffer = nullptr; state->d_AccumulationBuffer = nullptr; state->d_OutputImage = nullptr; + state->d_InteropBuffer = nullptr; state->d_Spheres = nullptr; state->d_Materials = nullptr; state->d_RayDirections = nullptr; @@ -278,7 +280,7 @@ void CUDARenderer_Render( state->d_SampleBuffer, state->d_AccumulationBuffer, state->d_DenoiseBuffer, - state->d_OutputImage, + state->d_InteropBuffer ? (uint32_t*)state->d_InteropBuffer : state->d_OutputImage, frameIndex, state->pixelCount ); @@ -349,7 +351,7 @@ void CUDARenderer_DebugFill(CUDARenderState* state) ); DebugFillKernel<<>>( - state->d_OutputImage, + state->d_InteropBuffer ? (uint32_t*)state->d_InteropBuffer : state->d_OutputImage, state->imageWidth, state->imageHeight ); @@ -357,6 +359,20 @@ void CUDARenderer_DebugFill(CUDARenderState* state) cudaDeviceSynchronize(); } +// ─── Vulkan-CUDA Interop API ─── + +void CUDARenderer_SetOutputBuffer(CUDARenderState* state, void* devPtr) +{ + if (!state) return; + state->d_InteropBuffer = devPtr; +} + +void CUDARenderer_Sync(CUDARenderState* state) +{ + if (!state || !state->initialized) return; + cudaStreamSynchronize(state->computeStream); +} + void CUDARenderer_CheckError(const char* file, int line) { cudaError_t err = cudaGetLastError(); diff --git a/RayTracing/src/CUDARenderer.h b/RayTracing/src/CUDARenderer.h index e35d237..73df838 100644 --- a/RayTracing/src/CUDARenderer.h +++ b/RayTracing/src/CUDARenderer.h @@ -42,11 +42,17 @@ void CUDARenderer_SetCameraPosition( // Render one frame (launches CUDA kernel) void CUDARenderer_Render(CUDARenderState* state, uint32_t frameIndex); -// Download output image (device → host) +// Download output image (device → host) — legacy, skipped when interop active void CUDARenderer_GetOutput( CUDARenderState* state, void* hostOutput, uint32_t byteSize); +// Vulkan-CUDA interop: set externally-allocated output buffer (null = use d_OutputImage) +void CUDARenderer_SetOutputBuffer(CUDARenderState* state, void* devPtr); + +// Synchronize CUDA compute stream (replaces blocking GetOutput when interop active) +void CUDARenderer_Sync(CUDARenderState* state); + // Get denoise buffer pointer (float4* HDR, for OptiX denoiser) void* CUDARenderer_GetDenoiseBuffer(CUDARenderState* state); diff --git a/RayTracing/src/Renderer.cpp b/RayTracing/src/Renderer.cpp index 7067a20..a75f315 100644 --- a/RayTracing/src/Renderer.cpp +++ b/RayTracing/src/Renderer.cpp @@ -2,6 +2,10 @@ #include "Walnut/Random.h" +#ifdef WL_CUDA +#include "Walnut/Application.h" +#endif + #include #include #include @@ -181,6 +185,15 @@ void Renderer::OnResize(uint32_t width, uint32_t height) if (cudaInitialized) { CUDARenderer_OnResize(m_CUDAState, width, height); + + // Recreate interop buffer on resize when enabled + if (m_InteropEnabled) + { + try { m_Interop = std::make_unique(width, height); } + catch (...) { m_Interop.reset(); m_InteropEnabled = false; } + if (m_Interop) + CUDARenderer_SetOutputBuffer(m_CUDAState, m_Interop->GetCUDADevicePtr()); + } } #endif @@ -349,7 +362,66 @@ void Renderer::Render(const Scene& scene, const Camera& camera) #endif // WL_ISPC #endif // !WL_CUDA - m_FinalImage->SetData(m_ImageData); +#ifdef WL_CUDA + if (m_InteropEnabled && m_Interop) + { + // Interop path: CUDA wrote to Vulkan buffer, copy to Walnut's VkImage + VkCommandBuffer cmd = Walnut::Application::GetCommandBuffer(true); + VkImage dstImage = m_FinalImage->GetImage(); + + // Buffer barrier: external (CUDA) write → Vulkan transfer read + VkBufferMemoryBarrier bufBarrier = {}; + bufBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + bufBarrier.srcAccessMask = 0; + bufBarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + bufBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + bufBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + bufBarrier.buffer = m_Interop->GetVulkanBuffer(); + bufBarrier.size = VK_WHOLE_SIZE; + vkCmdPipelineBarrier(cmd, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, 0, nullptr, 1, &bufBarrier, 0, nullptr); + + VkImageMemoryBarrier preBarrier = {}; + preBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + preBarrier.oldLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + preBarrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + preBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + preBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + preBarrier.image = dstImage; + preBarrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }; + preBarrier.srcAccessMask = 0; + preBarrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, nullptr, 1, &preBarrier); + + VkBufferImageCopy region = {}; + region.imageSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 }; + region.imageExtent = { m_Interop->GetWidth(), m_Interop->GetHeight(), 1 }; + vkCmdCopyBufferToImage(cmd, m_Interop->GetVulkanBuffer(), dstImage, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ®ion); + + VkImageMemoryBarrier postBarrier = {}; + postBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + postBarrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + postBarrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + postBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + postBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + postBarrier.image = dstImage; + postBarrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }; + postBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + postBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &postBarrier); + + Walnut::Application::FlushCommandBuffer(cmd); + } + else +#endif + { + m_FinalImage->SetData(m_ImageData); + } if (m_Settings.Accumulate) m_FrameIndex++; @@ -573,6 +645,31 @@ void Renderer::RenderGPU(const Scene& scene, const Camera& camera) const uint32_t height = m_FinalImage->GetHeight(); if (width == 0 || height == 0) return; + // Sync interop toggle from settings (requires re-creation on enable) + if (m_Settings.EnableInterop != m_InteropEnabled) + { + m_InteropEnabled = m_Settings.EnableInterop; + if (m_InteropEnabled) + { + try + { + m_Interop = std::make_unique(width, height); + CUDARenderer_SetOutputBuffer(m_CUDAState, m_Interop->GetCUDADevicePtr()); + } + catch (...) + { + m_Interop.reset(); + m_InteropEnabled = false; + m_Settings.EnableInterop = false; + } + } + else + { + m_Interop.reset(); + CUDARenderer_SetOutputBuffer(m_CUDAState, nullptr); + } + } + // Upload scene data only when changed (tracked by scene version) if (scene.Version != m_LastSceneVersion) { @@ -631,12 +728,19 @@ void Renderer::RenderGPU(const Scene& scene, const Camera& camera) } #endif - // Download output image from GPU - CUDARenderer_GetOutput( - m_CUDAState, - m_ImageData, - width * height * sizeof(uint32_t) - ); + // Download output image from GPU (skip D2H when interop writes directly to Vulkan) + if (m_InteropEnabled && m_Interop) + { + m_Interop->SyncCUDAComplete((cudaStream_t)CUDARenderer_GetComputeStream(m_CUDAState)); + } + else + { + CUDARenderer_GetOutput( + m_CUDAState, + m_ImageData, + width * height * sizeof(uint32_t) + ); + } } #endif // WL_CUDA diff --git a/RayTracing/src/Renderer.h b/RayTracing/src/Renderer.h index e6d075b..8c5b869 100644 --- a/RayTracing/src/Renderer.h +++ b/RayTracing/src/Renderer.h @@ -11,6 +11,7 @@ #endif #ifdef WL_CUDA +#include "VkCUDAInterop.h" // Must precede CUDARenderer.h — defines CUDART_VERSION so float3 guard works #include "CUDARenderer.h" #endif @@ -31,6 +32,9 @@ class Renderer bool SlowRandom = true; #ifdef WL_OPTIX bool EnableDenoising = true; +#endif +#ifdef WL_CUDA + bool EnableInterop = false; #endif }; @@ -97,6 +101,8 @@ class Renderer #ifdef WL_CUDA CUDARenderState* m_CUDAState = nullptr; + std::unique_ptr m_Interop; + bool m_InteropEnabled = false; std::vector m_GPUSpheres; std::vector m_GPUMaterials; std::vector m_GPURayDirs; diff --git a/RayTracing/src/VkCUDAInterop.cpp b/RayTracing/src/VkCUDAInterop.cpp new file mode 100644 index 0000000..89edd45 --- /dev/null +++ b/RayTracing/src/VkCUDAInterop.cpp @@ -0,0 +1,115 @@ +#include "VkCUDAInterop.h" +#include "Walnut/Application.h" +#include +#include + +static uint32_t FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) +{ + VkPhysicalDeviceMemoryProperties memProps; + vkGetPhysicalDeviceMemoryProperties(Walnut::Application::GetPhysicalDevice(), &memProps); + for (uint32_t i = 0; i < memProps.memoryTypeCount; i++) + if ((typeFilter & (1 << i)) && + (memProps.memoryTypes[i].propertyFlags & properties) == properties) + return i; + throw std::runtime_error("VkCUDAInterop: no suitable memory type"); +} + +VkCUDAInterop::VkCUDAInterop(uint32_t width, uint32_t height) + : m_Width(width), m_Height(height), m_Size(width * height * sizeof(uint32_t)) +{ + CreateVulkanBuffer(); + ExportToCUDA(); +} + +VkCUDAInterop::~VkCUDAInterop() +{ + VkDevice device = Walnut::Application::GetDevice(); + if (m_CUDAExtMem) cudaDestroyExternalMemory(m_CUDAExtMem); + if (m_Memory) vkFreeMemory(device, m_Memory, nullptr); + if (m_Buffer) vkDestroyBuffer(device, m_Buffer, nullptr); +} + +void VkCUDAInterop::CreateVulkanBuffer() +{ + VkDevice device = Walnut::Application::GetDevice(); + + VkExternalMemoryBufferCreateInfo extInfo = {}; + extInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO; + extInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; + + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.pNext = &extInfo; + bufferInfo.size = m_Size; + bufferInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + + if (vkCreateBuffer(device, &bufferInfo, nullptr, &m_Buffer) != VK_SUCCESS) + throw std::runtime_error("VkCUDAInterop: vkCreateBuffer failed"); + + VkMemoryRequirements memReq; + vkGetBufferMemoryRequirements(device, m_Buffer, &memReq); + + VkMemoryDedicatedAllocateInfo dedicatedInfo = {}; + dedicatedInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO; + dedicatedInfo.buffer = m_Buffer; + + VkExportMemoryAllocateInfo exportInfo = {}; + exportInfo.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO; + exportInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; + exportInfo.pNext = nullptr; + dedicatedInfo.pNext = &exportInfo; + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.pNext = &dedicatedInfo; + allocInfo.allocationSize = memReq.size; + allocInfo.memoryTypeIndex = FindMemoryType(memReq.memoryTypeBits, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &m_Memory) != VK_SUCCESS) + throw std::runtime_error("VkCUDAInterop: vkAllocateMemory failed"); + + vkBindBufferMemory(device, m_Buffer, m_Memory, 0); +} + +void VkCUDAInterop::ExportToCUDA() +{ + VkDevice device = Walnut::Application::GetDevice(); + + auto vkGetMemoryWin32HandleKHR = (PFN_vkGetMemoryWin32HandleKHR) + vkGetDeviceProcAddr(device, "vkGetMemoryWin32HandleKHR"); + if (!vkGetMemoryWin32HandleKHR) + throw std::runtime_error("VkCUDAInterop: vkGetMemoryWin32HandleKHR not available"); + + VkMemoryGetWin32HandleInfoKHR handleInfo = {}; + handleInfo.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + handleInfo.memory = m_Memory; + handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; + + HANDLE handle = nullptr; + if (vkGetMemoryWin32HandleKHR(device, &handleInfo, &handle) != VK_SUCCESS) + throw std::runtime_error("VkCUDAInterop: vkGetMemoryWin32HandleKHR failed"); + + cudaExternalMemoryHandleDesc extMemDesc = {}; + extMemDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; + extMemDesc.handle.win32.handle = handle; + extMemDesc.size = m_Size; + extMemDesc.flags = cudaExternalMemoryDedicated; + + cudaError_t err = cudaImportExternalMemory(&m_CUDAExtMem, &extMemDesc); + CloseHandle(handle); + if (err != cudaSuccess) + throw std::runtime_error(std::string("VkCUDAInterop: cudaImportExternalMemory failed: ") + cudaGetErrorString(err)); + + cudaExternalMemoryBufferDesc bufDesc = {}; + bufDesc.size = m_Size; + + err = cudaExternalMemoryGetMappedBuffer(&m_CUDADevPtr, m_CUDAExtMem, &bufDesc); + if (err != cudaSuccess) + throw std::runtime_error(std::string("VkCUDAInterop: cudaExternalMemoryGetMappedBuffer failed: ") + cudaGetErrorString(err)); +} + +void VkCUDAInterop::SyncCUDAComplete(cudaStream_t stream) +{ + cudaStreamSynchronize(stream); +} diff --git a/RayTracing/src/VkCUDAInterop.h b/RayTracing/src/VkCUDAInterop.h new file mode 100644 index 0000000..16f9e82 --- /dev/null +++ b/RayTracing/src/VkCUDAInterop.h @@ -0,0 +1,33 @@ +#pragma once + +#define VK_USE_PLATFORM_WIN32_KHR +#include +#include +#include + +class VkCUDAInterop +{ +public: + VkCUDAInterop(uint32_t width, uint32_t height); + ~VkCUDAInterop(); + + void* GetCUDADevicePtr() const { return m_CUDADevPtr; } + VkBuffer GetVulkanBuffer() const { return m_Buffer; } + uint32_t GetWidth() const { return m_Width; } + uint32_t GetHeight() const { return m_Height; } + + void SyncCUDAComplete(cudaStream_t stream); + +private: + void CreateVulkanBuffer(); + void ExportToCUDA(); + + uint32_t m_Width, m_Height; + VkDeviceSize m_Size; + + VkBuffer m_Buffer = VK_NULL_HANDLE; + VkDeviceMemory m_Memory = VK_NULL_HANDLE; + + cudaExternalMemory_t m_CUDAExtMem = nullptr; + void* m_CUDADevPtr = nullptr; +}; diff --git a/RayTracing/src/WalnutApp.cpp b/RayTracing/src/WalnutApp.cpp index 95ab374..fac80b9 100644 --- a/RayTracing/src/WalnutApp.cpp +++ b/RayTracing/src/WalnutApp.cpp @@ -104,6 +104,10 @@ class ExampleLayer final : public Walnut::Layer if (ImGui::Checkbox("Denoise", &m_Renderer.GetSettings().EnableDenoising)) m_NeedsRender = true; #endif +#ifdef WL_CUDA + if (ImGui::Checkbox("Vulkan-CUDA Interop", &m_Renderer.GetSettings().EnableInterop)) + m_NeedsRender = true; +#endif if (ImGui::Button("Reset")) { diff --git a/Walnut b/Walnut index 3b8e414..12814d1 160000 --- a/Walnut +++ b/Walnut @@ -1 +1 @@ -Subproject commit 3b8e414fdecfc6c8b58816106fe8d912bd172e31 +Subproject commit 12814d1f7bb19d5752609eb315762b4e2bc6ff2c