From d293b9b85589ba6e483bc2e7bdabe9c563793190 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 8 Jul 2023 18:29:58 +0300 Subject: [PATCH 01/62] create exportable buffers to import into cuda --- include/nbl/video/CCUDAHandler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 01774b25d2..fe1ba28204 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -119,6 +119,7 @@ class CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate ,cuTexObjectDestroy + ,cuImportExternalMemory ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} From f5f1017b876c9c8beb5c1c2fa43749977c68a2ba Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 16:10:04 +0300 Subject: [PATCH 02/62] add missing cuda fn and update submodule --- examples_tests | 2 +- include/nbl/video/CCUDADevice.h | 2 +- include/nbl/video/CCUDAHandler.h | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples_tests b/examples_tests index 31f501f9b1..faddda46b2 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 31f501f9b1624457eaf4a71eececa1fb67172ca3 +Subproject commit faddda46b285b433c2926b384064bd80a6889b43 diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 1120224fdb..ceb8ec3a8d 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -179,7 +179,7 @@ class CCUDADevice : public core::IReferenceCounted static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); #endif - + CUdevice getInternalObject() const { return m_handle; } protected: friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* 
const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index fe1ba28204..fb3d52fc0f 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -120,6 +120,8 @@ class CCUDAHandler : public core::IReferenceCounted ,cuTexObjectCreate ,cuTexObjectDestroy ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} From 6689b335623771a309904de44f27556476c91978 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 20:56:00 +0300 Subject: [PATCH 03/62] add missing cuda export functions --- include/nbl/video/CCUDAHandler.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index fb3d52fc0f..838c527567 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -122,6 +122,15 @@ class CCUDAHandler : public core::IReferenceCounted ,cuImportExternalMemory ,cuDestroyExternalMemory ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} From 9ade1c66917d1c34d072623e62c50c2dfb6f3b75 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 23:34:16 +0300 Subject: [PATCH 04/62] move boilerplates to CCUDADevice --- include/nbl/video/CCUDADevice.h | 31 +++++ include/nbl/video/CCUDAHandler.h | 1 + include/nbl/video/IDeviceMemoryBacked.h | 25 ++++ src/nbl/video/CCUDADevice.cpp | 145 ++++++++++++++++++++++++ 4 files changed, 202 insertions(+) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index ceb8ec3a8d..fd39f8ec57 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -180,7 
+180,38 @@ class CCUDADevice : public core::IReferenceCounted static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); #endif CUdevice getInternalObject() const { return m_handle; } + const CCUDAHandler* getHandler() const { return m_handler.get(); } + + struct SSharedCUDAMemory + { + size_t size; + CUdeviceptr ptr; + CUmemGenericAllocationHandle memory; + void* osHandle; + }; + + core::smart_refctd_ptr exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device); + + CUresult importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr); + CUresult createExportableMemory(size_t size, size_t alignment, SSharedCUDAMemory* outMem); + CUresult releaseExportableMemory(SSharedCUDAMemory mem); protected: + + struct SCUDACleaner : video::ICleanup, SSharedCUDAMemory + { + core::smart_refctd_ptr dev; + SCUDACleaner(SSharedCUDAMemory mem, core::smart_refctd_ptr&& dev) + : SSharedCUDAMemory{ mem } + , dev(std::move(dev)) + { } + + ~SCUDACleaner() + { + dev->releaseExportableMemory(*this); + } + }; + + CUresult reserveAdrressAndMapMemory(size_t size, size_t alignment, CUmemGenericAllocationHandle memory, CUdeviceptr* outPtr); friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); ~CCUDADevice() = default; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 838c527567..5341563ea0 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -131,6 +131,7 @@ class CCUDAHandler : public core::IReferenceCounted ,cuMemMap ,cuMemRelease ,cuMemSetAccess + ,cuMemImportFromShareableHandle ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index f2b449557c..24b9b79439 100644 --- 
a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -93,6 +93,31 @@ class IDeviceMemoryBacked : public IBackendObject const uint32_t* queueFamilyIndices = nullptr; }; + void chainPreDestroyCleanup(std::unique_ptr next) + { + if (!m_cachedCreationParams.preDestroyCleanup) + { + m_cachedCreationParams.preDestroyCleanup = std::move(next); + return; + } + + struct SChainedCleanup : ICleanup + { + std::unique_ptr first, next; + SChainedCleanup(std::unique_ptr&& first, std::unique_ptr&& next) + : first(std::move(first)) + , next(std::move(next)) + { } + ~SChainedCleanup() + { + first = nullptr; + next = nullptr; + } + }; + + m_cachedCreationParams.preDestroyCleanup = std::make_unique(std::move(m_cachedCreationParams.preDestroyCleanup), std::move(next)); + } + protected: inline IDeviceMemoryBacked(core::smart_refctd_ptr&& originDevice, SCreationParams&& creationParams, const SDeviceMemoryRequirements& reqs) : IBackendObject(std::move(originDevice)), m_cachedCreationParams(std::move(creationParams)), m_cachedMemoryReqs(reqs) {} diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 4d2e880095..565621b00a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -17,6 +17,151 @@ CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConn } +CUresult CCUDADevice::reserveAdrressAndMapMemory(size_t size, size_t alignment, CUmemGenericAllocationHandle memory, CUdeviceptr* outPtr) +{ + auto& cu = m_handler->getCUDAFunctionTable(); + + CUdeviceptr ptr = 0; + if (auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) + { + return err; + } + + if (auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) + { + cu.pcuMemAddressFree(ptr, size); + return err; + } + + CUmemAccessDesc accessDesc = { + .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = m_handle }, + .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, + }; + + if (auto err = 
cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) + { + cu.pcuMemUnmap(ptr, size); + cu.pcuMemAddressFree(ptr, size); + return err; + } + + *outPtr = ptr; + + return CUDA_SUCCESS; +} + +CUresult CCUDADevice::releaseExportableMemory(SSharedCUDAMemory mem) +{ + auto& cu = m_handler->getCUDAFunctionTable(); + if (auto err = cu.pcuMemUnmap(mem.ptr, mem.size); CUDA_SUCCESS != err) return err; + if (auto err = cu.pcuMemAddressFree(mem.ptr, mem.size); CUDA_SUCCESS != err) return err; + if (auto err = cu.pcuMemRelease(mem.memory); CUDA_SUCCESS != err) return err; + CloseHandle(mem.osHandle); +} + +CUresult CCUDADevice::createExportableMemory(size_t size, size_t alignment, SSharedCUDAMemory* outMem) +{ + if (!outMem) + return CUDA_ERROR_INVALID_VALUE; + + auto& cu = m_handler->getCUDAFunctionTable(); + + uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = CU_MEM_HANDLE_TYPE_WIN32, + .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = m_handle }, + .win32HandleMetaData = metaData, + }; + + size_t granularity = 0; + if (auto err = cu.pcuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); CUDA_SUCCESS != err) + return err; + + size = ((size - 1) / granularity + 1) * granularity; + + CUmemGenericAllocationHandle mem = 0; + void* handle = 0; + CUdeviceptr ptr = 0; + + if(auto err = cu.pcuMemCreate(&mem, size, &prop, 0); CUDA_SUCCESS != err) + return err; + + if (auto err = cu.pcuMemExportToShareableHandle(&handle, mem, CU_MEM_HANDLE_TYPE_WIN32, 0); CUDA_SUCCESS != err) + { + cu.pcuMemRelease(mem); + return err; + } + + if (auto err = reserveAdrressAndMapMemory(size, alignment, mem, &ptr); CUDA_SUCCESS != err) + { + CloseHandle(handle); + cu.pcuMemRelease(mem); + return err; + } + + outMem->size = size; + outMem->memory = mem; + outMem->ptr = ptr; + outMem->osHandle = handle; + return CUDA_SUCCESS; +} + +core::smart_refctd_ptr 
CCUDADevice::exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device) +{ + auto buf = device->createBuffer( + { {.size = mem.size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT }, + { {.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = mem.osHandle}} }); + + auto req = buf->getMemoryReqs(); + req.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto allocation = device->allocate(req, buf.get()); + + if (!(allocation.memory && allocation.offset != ILogicalDevice::InvalidMemoryOffset)) + return nullptr; + + buf->chainPreDestroyCleanup(std::make_unique(mem, core::smart_refctd_ptr(this))); + return buf; +} + +CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr) +{ + auto& params = buf->getCachedCreationParams(); + + if (!params.externalMemoryHandType.value || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, + .handle = {.win32 = {.handle = buf->getExternalHandle()}}, + .size = buf->getMemoryReqs().size, + }; + + CUmemGenericAllocationHandle mem = 0; + CUdeviceptr ptr = 0; + void* handle = handleDesc.handle.win32.handle; + + auto& cu = m_handler->getCUDAFunctionTable(); + if (auto err = cu.pcuMemImportFromShareableHandle(&mem, buf->getExternalHandle(), + static_cast(params.externalMemoryHandType.value)); + CUDA_SUCCESS != err) + return err; + + if(auto err = reserveAdrressAndMapMemory(buf->getSize(), 1 << buf->getMemoryReqs().alignmentLog2, mem, &ptr)) + { + cu.pcuMemRelease(mem); + return err; + } + + outPtr->ptr = ptr; + outPtr->memory = mem; + outPtr->size = buf->getSize(); + outPtr->osHandle = handle; + + buf->chainPreDestroyCleanup(std::make_unique(*outPtr, core::smart_refctd_ptr(this))); + return CUDA_SUCCESS; +} + #if 0 CUresult CCUDAHandler::registerBuffer(GraphicsAPIObjLink* 
link, uint32_t flags) { From bfa7afc5d01357f0f20cf904ab345100ba00631f Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 15 Jul 2023 13:38:16 +0300 Subject: [PATCH 05/62] correct chained cleanup desctruction order --- include/nbl/video/IDeviceMemoryBacked.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index 24b9b79439..4d44131e7c 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -93,11 +93,11 @@ class IDeviceMemoryBacked : public IBackendObject const uint32_t* queueFamilyIndices = nullptr; }; - void chainPreDestroyCleanup(std::unique_ptr next) + void chainPreDestroyCleanup(std::unique_ptr first) { if (!m_cachedCreationParams.preDestroyCleanup) { - m_cachedCreationParams.preDestroyCleanup = std::move(next); + m_cachedCreationParams.preDestroyCleanup = std::move(first); return; } @@ -115,7 +115,7 @@ class IDeviceMemoryBacked : public IBackendObject } }; - m_cachedCreationParams.preDestroyCleanup = std::make_unique(std::move(m_cachedCreationParams.preDestroyCleanup), std::move(next)); + m_cachedCreationParams.preDestroyCleanup = std::make_unique(std::move(first), std::move(m_cachedCreationParams.preDestroyCleanup)); } protected: From ddb861edf0c32c02e2a00e3d0628c9f6b4b1938f Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 15 Jul 2023 13:59:32 +0300 Subject: [PATCH 06/62] add safety checks --- include/nbl/video/CCUDADevice.h | 2 +- include/nbl/video/IDeviceMemoryBacked.h | 2 +- src/nbl/video/CCUDADevice.cpp | 23 ++++++++++++++++++++--- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index fd39f8ec57..183fb577be 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -205,7 +205,7 @@ class CCUDADevice : public core::IReferenceCounted , dev(std::move(dev)) { } - ~SCUDACleaner() + ~SCUDACleaner() 
override { dev->releaseExportableMemory(*this); } diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index 4d44131e7c..ef9bef6588 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -92,7 +92,7 @@ class IDeviceMemoryBacked : public IBackendObject { const uint32_t* queueFamilyIndices = nullptr; }; - + void chainPreDestroyCleanup(std::unique_ptr first) { if (!m_cachedCreationParams.preDestroyCleanup) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 565621b00a..cb6dc31728 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -109,9 +109,23 @@ CUresult CCUDADevice::createExportableMemory(size_t size, size_t alignment, SSha core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device) { + + if (!device || !mem.memory || !mem.osHandle || !mem.ptr || !mem.size) + return nullptr; + + { + CUuuid id; + // TODO(Atil): Cache properties + if (CUDA_SUCCESS != m_handler->getCUDAFunctionTable().pcuDeviceGetUuid(&id, m_handle)) + return nullptr; + + if (memcmp(&id, device->getPhysicalDevice()->getProperties().deviceUUID, 16)) + return nullptr; + } + auto buf = device->createBuffer( { {.size = mem.size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT }, - { {.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = mem.osHandle}} }); + { {.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = mem.osHandle}}}); auto req = buf->getMemoryReqs(); req.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); @@ -126,9 +140,12 @@ core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemor CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr) { + if (!buf || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + 
auto& params = buf->getCachedCreationParams(); - if (!params.externalMemoryHandType.value || !outPtr) + if (!params.externalMemoryHandType.value) return CUDA_ERROR_INVALID_VALUE; CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { @@ -147,7 +164,7 @@ CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr CUDA_SUCCESS != err) return err; - if(auto err = reserveAdrressAndMapMemory(buf->getSize(), 1 << buf->getMemoryReqs().alignmentLog2, mem, &ptr)) + if(auto err = reserveAdrressAndMapMemory(buf->getSize(), 1u << buf->getMemoryReqs().alignmentLog2, mem, &ptr)) { cu.pcuMemRelease(mem); return err; From f3803982fa6d4e102c9aa358fabccaf34c281c45 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 15 Jul 2023 23:20:29 +0300 Subject: [PATCH 07/62] semaphore interop --- include/nbl/video/CCUDADevice.h | 49 ++++++++--- include/nbl/video/CCUDAHandler.h | 4 + include/nbl/video/IDeviceMemoryBacked.h | 33 +++----- src/nbl/video/CCUDADevice.cpp | 106 +++++++++++++++++------- src/nbl/video/CVulkanBuffer.cpp | 2 +- 5 files changed, 130 insertions(+), 64 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 183fb577be..26005fcba3 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -182,36 +182,59 @@ class CCUDADevice : public core::IReferenceCounted CUdevice getInternalObject() const { return m_handle; } const CCUDAHandler* getHandler() const { return m_handler.get(); } - struct SSharedCUDAMemory + struct SSharedCUDAMemory : core::IReferenceCounted { + core::smart_refctd_ptr device; size_t size; CUdeviceptr ptr; CUmemGenericAllocationHandle memory; void* osHandle; + SSharedCUDAMemory(core::smart_refctd_ptr device, size_t size, CUdeviceptr ptr, CUmemGenericAllocationHandle memory, void* osHandle) + : device(std::move(device)) + , size(size) + , ptr(ptr) + , memory(memory) + , osHandle(osHandle) + {} + ~SSharedCUDAMemory() override; }; - core::smart_refctd_ptr 
exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device); + struct SExternalCUDASemaphore : core::IReferenceCounted + { + core::smart_refctd_ptr device; + CUexternalSemaphore semaphore; + void* osHandle; + SExternalCUDASemaphore(core::smart_refctd_ptr device, CUexternalSemaphore semaphore, void* osHandle) + : device(std::move(device)) + , semaphore(semaphore) + , osHandle(osHandle) + {} + ~SExternalCUDASemaphore() override; + }; - CUresult importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr); - CUresult createExportableMemory(size_t size, size_t alignment, SSharedCUDAMemory* outMem); - CUresult releaseExportableMemory(SSharedCUDAMemory mem); + core::smart_refctd_ptr exportGPUBuffer(SSharedCUDAMemory* mem, ILogicalDevice* device); + CUresult importGPUBuffer(core::smart_refctd_ptr* outPtr, IGPUBuffer* buf); + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, IGPUSemaphore* sem); + CUresult createExportableMemory(core::smart_refctd_ptr* outMem, size_t size, size_t alignment); + protected: + friend struct SSharedCUDAMemory; + CUresult releaseExportableMemory(SSharedCUDAMemory* mem); + CUresult destroyExternalSemaphore(SExternalCUDASemaphore* sema); - struct SCUDACleaner : video::ICleanup, SSharedCUDAMemory + struct SCUDACleaner : video::ICleanup { - core::smart_refctd_ptr dev; - SCUDACleaner(SSharedCUDAMemory mem, core::smart_refctd_ptr&& dev) - : SSharedCUDAMemory{ mem } - , dev(std::move(dev)) + core::smart_refctd_ptr resource; + SCUDACleaner(core::smart_refctd_ptr resource) + : resource(std::move(resource)) { } - ~SCUDACleaner() override { - dev->releaseExportableMemory(*this); + resource = nullptr; } }; - CUresult reserveAdrressAndMapMemory(size_t size, size_t alignment, CUmemGenericAllocationHandle memory, CUdeviceptr* outPtr); + CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemGenericAllocationHandle memory); friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, 
IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); ~CCUDADevice() = default; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 5341563ea0..b6f98385bb 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -132,6 +132,10 @@ class CCUDAHandler : public core::IReferenceCounted ,cuMemRelease ,cuMemSetAccess ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index ef9bef6588..d2ff049dfd 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -19,6 +19,15 @@ namespace nbl::video struct NBL_API2 ICleanup { virtual ~ICleanup() = 0; + + std::unique_ptr next; + + static void chain(std::unique_ptr& first, std::unique_ptr&& next) + { + if (first) + return chain(first->next, std::move(next)); + first = std::move(next); + } }; //! 
Interface from which resources backed by IDeviceMemoryAllocation inherit from @@ -95,29 +104,9 @@ class IDeviceMemoryBacked : public IBackendObject void chainPreDestroyCleanup(std::unique_ptr first) { - if (!m_cachedCreationParams.preDestroyCleanup) - { - m_cachedCreationParams.preDestroyCleanup = std::move(first); - return; - } - - struct SChainedCleanup : ICleanup - { - std::unique_ptr first, next; - SChainedCleanup(std::unique_ptr&& first, std::unique_ptr&& next) - : first(std::move(first)) - , next(std::move(next)) - { } - ~SChainedCleanup() - { - first = nullptr; - next = nullptr; - } - }; - - m_cachedCreationParams.preDestroyCleanup = std::make_unique(std::move(first), std::move(m_cachedCreationParams.preDestroyCleanup)); + ICleanup::chain(m_cachedCreationParams.preDestroyCleanup, std::move(first)); } - + protected: inline IDeviceMemoryBacked(core::smart_refctd_ptr&& originDevice, SCreationParams&& creationParams, const SDeviceMemoryRequirements& reqs) : IBackendObject(std::move(originDevice)), m_cachedCreationParams(std::move(creationParams)), m_cachedMemoryReqs(reqs) {} diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index cb6dc31728..c83fb562ba 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -17,7 +17,7 @@ CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConn } -CUresult CCUDADevice::reserveAdrressAndMapMemory(size_t size, size_t alignment, CUmemGenericAllocationHandle memory, CUdeviceptr* outPtr) +CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemGenericAllocationHandle memory) { auto& cu = m_handler->getCUDAFunctionTable(); @@ -50,16 +50,25 @@ CUresult CCUDADevice::reserveAdrressAndMapMemory(size_t size, size_t alignment, return CUDA_SUCCESS; } -CUresult CCUDADevice::releaseExportableMemory(SSharedCUDAMemory mem) +CUresult CCUDADevice::releaseExportableMemory(SSharedCUDAMemory* mem) { auto& cu = m_handler->getCUDAFunctionTable(); - 
if (auto err = cu.pcuMemUnmap(mem.ptr, mem.size); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemAddressFree(mem.ptr, mem.size); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemRelease(mem.memory); CUDA_SUCCESS != err) return err; - CloseHandle(mem.osHandle); + if (auto err = cu.pcuMemUnmap(mem->ptr, mem->size); CUDA_SUCCESS != err) return err; + if (auto err = cu.pcuMemAddressFree(mem->ptr, mem->size); CUDA_SUCCESS != err) return err; + if (auto err = cu.pcuMemRelease(mem->memory); CUDA_SUCCESS != err) return err; + CloseHandle(mem->osHandle); + return CUDA_SUCCESS; } -CUresult CCUDADevice::createExportableMemory(size_t size, size_t alignment, SSharedCUDAMemory* outMem) +CUresult CCUDADevice::destroyExternalSemaphore(SExternalCUDASemaphore* sema) +{ + auto& cu = m_handler->getCUDAFunctionTable(); + if (auto err = cu.pcuDestroyExternalSemaphore(sema->semaphore); CUDA_SUCCESS != err) return err; + CloseHandle(sema->osHandle); + return CUDA_SUCCESS; +} + +CUresult CCUDADevice::createExportableMemory(core::smart_refctd_ptr* outMem, size_t size, size_t alignment) { if (!outMem) return CUDA_ERROR_INVALID_VALUE; @@ -93,24 +102,32 @@ CUresult CCUDADevice::createExportableMemory(size_t size, size_t alignment, SSha return err; } - if (auto err = reserveAdrressAndMapMemory(size, alignment, mem, &ptr); CUDA_SUCCESS != err) + if (auto err = reserveAdrressAndMapMemory(&ptr, size, alignment, mem); CUDA_SUCCESS != err) { CloseHandle(handle); cu.pcuMemRelease(mem); return err; } - outMem->size = size; - outMem->memory = mem; - outMem->ptr = ptr; - outMem->osHandle = handle; + *outMem = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), size, ptr, mem, handle); + return CUDA_SUCCESS; } -core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device) +CCUDADevice::SSharedCUDAMemory::~SSharedCUDAMemory() { + device->releaseExportableMemory(this); +} + +CCUDADevice::SExternalCUDASemaphore::~SExternalCUDASemaphore() +{ + 
device->destroyExternalSemaphore(this); +} - if (!device || !mem.memory || !mem.osHandle || !mem.ptr || !mem.size) +core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemory* mem, ILogicalDevice* device) +{ + + if (!device || !mem || !mem->memory || !mem->osHandle || !mem->ptr || !mem->size) return nullptr; { @@ -123,9 +140,17 @@ core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemor return nullptr; } - auto buf = device->createBuffer( - { {.size = mem.size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT }, - { {.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = mem.osHandle}}}); + auto buf = device->createBuffer(IGPUBuffer::SCreationParams { + asset::IBuffer::SCreationParams{ + .size = mem->size, + .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT + }, + IDeviceMemoryBacked::SCreationParams{ + IDeviceMemoryBacked::SCachedCreationParams{ + .externalHandleType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, + .externalHandle = mem->osHandle + } + }}); auto req = buf->getMemoryReqs(); req.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); @@ -134,22 +159,22 @@ core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemor if (!(allocation.memory && allocation.offset != ILogicalDevice::InvalidMemoryOffset)) return nullptr; - buf->chainPreDestroyCleanup(std::make_unique(mem, core::smart_refctd_ptr(this))); + buf->chainPreDestroyCleanup(std::make_unique(core::smart_refctd_ptr(mem))); return buf; } -CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr) +CUresult CCUDADevice::importGPUBuffer(core::smart_refctd_ptr* outPtr, IGPUBuffer* buf) { if (!buf || !outPtr) return CUDA_ERROR_INVALID_VALUE; auto& params = buf->getCachedCreationParams(); - if (!params.externalMemoryHandType.value) + if 
(!params.externalHandleType.value) return CUDA_ERROR_INVALID_VALUE; CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { - .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, + .type = static_cast(params.externalHandleType.value), .handle = {.win32 = {.handle = buf->getExternalHandle()}}, .size = buf->getMemoryReqs().size, }; @@ -160,22 +185,47 @@ CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr auto& cu = m_handler->getCUDAFunctionTable(); if (auto err = cu.pcuMemImportFromShareableHandle(&mem, buf->getExternalHandle(), - static_cast(params.externalMemoryHandType.value)); + static_cast(params.externalHandleType.value)); CUDA_SUCCESS != err) return err; - if(auto err = reserveAdrressAndMapMemory(buf->getSize(), 1u << buf->getMemoryReqs().alignmentLog2, mem, &ptr)) + if(auto err = reserveAdrressAndMapMemory(&ptr, buf->getSize(), 1u << buf->getMemoryReqs().alignmentLog2, mem)) { cu.pcuMemRelease(mem); return err; } - outPtr->ptr = ptr; - outPtr->memory = mem; - outPtr->size = buf->getSize(); - outPtr->osHandle = handle; + *outPtr = core::make_smart_refctd_ptr( + core::smart_refctd_ptr(this), + buf->getSize(), ptr, mem, handle); + + buf->chainPreDestroyCleanup(std::make_unique(*outPtr)); + return CUDA_SUCCESS; +} + +CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, IGPUSemaphore* sema) +{ + if (!sema || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + + auto& cu = m_handler->getCUDAFunctionTable(); + auto handleType = sema->getCreationParams().externalHandleType.value; + auto handle = sema->getCreationParams().externalHandle; - buf->chainPreDestroyCleanup(std::make_unique(*outPtr, core::smart_refctd_ptr(this))); + if (!handleType || !handle) + return CUDA_ERROR_INVALID_VALUE; + + CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { + .type = static_cast(handleType), + .handle = {.win32 = {.handle = handle }}, + }; + + CUexternalSemaphore cusema; + if (auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) 
+ return err; + + *outPtr = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), cusema, handle); + sema->chainPreDestroyCleanup(std::make_unique(*outPtr)); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CVulkanBuffer.cpp b/src/nbl/video/CVulkanBuffer.cpp index c24e13ceb8..2e542944d3 100644 --- a/src/nbl/video/CVulkanBuffer.cpp +++ b/src/nbl/video/CVulkanBuffer.cpp @@ -8,7 +8,7 @@ namespace nbl::video CVulkanBuffer::~CVulkanBuffer() { preDestroyStep(); - if (m_cachedCreationParams.skipHandleDestroy) + if (!m_cachedCreationParams.skipHandleDestroy) { const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); auto* vk = vulkanDevice->getFunctionTable(); From 2f7b517dd7e59b070e0609127448e4fa4565a2a5 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 15 Jul 2023 23:21:31 +0300 Subject: [PATCH 08/62] get cuda interop working in vulkan_1_3 branch --- examples_tests | 2 +- include/nbl/video/CCUDADevice.h | 189 ++--------- include/nbl/video/CCUDAHandler.h | 14 +- include/nbl/video/CCUDASharedMemory.h | 74 +++++ include/nbl/video/CCUDASharedSemaphore.h | 49 +++ include/nbl/video/IDeviceMemoryAllocation.h | 95 ++++-- include/nbl/video/IDeviceMemoryAllocator.h | 51 ++- include/nbl/video/IDeviceMemoryBacked.h | 3 + include/nbl/video/ILogicalDevice.h | 4 +- include/nbl/video/ISemaphore.h | 41 ++- include/nbl/video/SPhysicalDeviceLimits.h | 3 + src/nbl/CMakeLists.txt | 7 +- .../asset/interchange/CPLYMeshFileLoader.cpp | 7 +- .../asset/interchange/CSTLMeshFileLoader.cpp | 8 +- src/nbl/video/CCUDADevice.cpp | 306 ++++-------------- src/nbl/video/CCUDAHandler.cpp | 15 +- src/nbl/video/CCUDASharedMemory.cpp | 109 +++++++ src/nbl/video/CCUDASharedSemaphore.cpp | 18 ++ src/nbl/video/CVulkanCommandBuffer.cpp | 2 +- src/nbl/video/CVulkanLogicalDevice.cpp | 70 +++- src/nbl/video/CVulkanLogicalDevice.h | 2 +- src/nbl/video/CVulkanMemoryAllocation.cpp | 14 +- src/nbl/video/CVulkanMemoryAllocation.h | 7 +- src/nbl/video/CVulkanPhysicalDevice.cpp | 3 + 
src/nbl/video/CVulkanQueue.cpp | 2 +- src/nbl/video/CVulkanSemaphore.h | 7 +- src/nbl/video/IGPUCommandBuffer.cpp | 4 +- 27 files changed, 623 insertions(+), 483 deletions(-) create mode 100644 include/nbl/video/CCUDASharedMemory.h create mode 100644 include/nbl/video/CCUDASharedSemaphore.h create mode 100644 src/nbl/video/CCUDASharedMemory.cpp create mode 100644 src/nbl/video/CCUDASharedSemaphore.cpp diff --git a/examples_tests b/examples_tests index faddda46b2..6ce21d5c5c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit faddda46b285b433c2926b384064bd80a6889b43 +Subproject commit 6ce21d5c5c8026b6772f3e60e21096ee54353a81 diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 26005fcba3..7b2b952548 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -6,7 +6,8 @@ #include "nbl/video/IPhysicalDevice.h" - +#include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDASharedSemaphore.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -23,10 +24,20 @@ namespace nbl::video { class CCUDAHandler; +class CCUDASharedMemory; +class CCUDASharedSemaphore; class CCUDADevice : public core::IReferenceCounted { public: +#ifdef _WIN32 + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; + static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; +#else + static constexpr IDeviceMemoryBacked::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryBacked::EHT_OPAQUE_FD; + static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif + enum E_VIRTUAL_ARCHITECTURE { EVA_30, @@ -72,181 +83,45 @@ class CCUDADevice : public core::IReferenceCounted // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vulkan-interoperability // Watch out, use Driver API (`cu` functions) NOT the Runtime API (`cuda` functions) // 
Also maybe separate this out into its own `CCUDA` class instead of nesting it here? -#if 0 - template - struct GraphicsAPIObjLink - { - GraphicsAPIObjLink() : obj(nullptr), cudaHandle(nullptr), acquired(false) - { - asImage = {nullptr}; - } - GraphicsAPIObjLink(core::smart_refctd_ptr&& _obj) : GraphicsAPIObjLink() - { - obj = std::move(_obj); - } - GraphicsAPIObjLink(GraphicsAPIObjLink&& other) : GraphicsAPIObjLink() - { - operator=(std::move(other)); - } - - GraphicsAPIObjLink(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(GraphicsAPIObjLink&& other) - { - std::swap(obj,other.obj); - std::swap(cudaHandle,other.cudaHandle); - std::swap(acquired,other.acquired); - std::swap(asImage,other.asImage); - return *this; - } - - ~GraphicsAPIObjLink() - { - assert(!acquired); // you've fucked up, there's no way for us to fix it, you need to release the objects on a proper stream - if (obj) - CCUDAHandler::cuda.pcuGraphicsUnregisterResource(cudaHandle); - } - - // - auto* getObject() const {return obj.get();} - - private: - core::smart_refctd_ptr obj; - CUgraphicsResource cudaHandle; - bool acquired; - - friend class CCUDAHandler; - public: - union - { - struct - { - CUdeviceptr pointer; - } asBuffer; - struct - { - CUmipmappedArray mipmappedArray; - CUarray array; - } asImage; - }; - }; - - // - static CUresult registerBuffer(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - static CUresult registerImage(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - - template - static CUresult acquireResourcesFromGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (iit->acquired) - 
return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsMapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = true; - return retval; - } - template - static CUresult releaseResourcesToGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsUnmapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = false; - return retval; - } - - static CUresult acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes = nullptr); - static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); - static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); -#endif CUdevice getInternalObject() const { return m_handle; } const CCUDAHandler* getHandler() const { return m_handler.get(); } - - struct SSharedCUDAMemory : core::IReferenceCounted - { - core::smart_refctd_ptr device; - size_t size; - CUdeviceptr ptr; - CUmemGenericAllocationHandle memory; - void* osHandle; - SSharedCUDAMemory(core::smart_refctd_ptr device, size_t size, CUdeviceptr ptr, CUmemGenericAllocationHandle memory, void* osHandle) - : device(std::move(device)) - , size(size) - , ptr(ptr) - , memory(memory) - , osHandle(osHandle) - {} - ~SSharedCUDAMemory() override; - }; - - struct SExternalCUDASemaphore : core::IReferenceCounted - { - core::smart_refctd_ptr device; - CUexternalSemaphore semaphore; - void* 
osHandle; - SExternalCUDASemaphore(core::smart_refctd_ptr device, CUexternalSemaphore semaphore, void* osHandle) - : device(std::move(device)) - , semaphore(semaphore) - , osHandle(osHandle) - {} - ~SExternalCUDASemaphore() override; - }; - - core::smart_refctd_ptr exportGPUBuffer(SSharedCUDAMemory* mem, ILogicalDevice* device); - CUresult importGPUBuffer(core::smart_refctd_ptr* outPtr, IGPUBuffer* buf); - CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, IGPUSemaphore* sem); - CUresult createExportableMemory(core::smart_refctd_ptr* outMem, size_t size, size_t alignment); + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + CUresult createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); + bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } + size_t roundToGranularity(CUmemLocationType location, size_t size) const; + protected: - friend struct SSharedCUDAMemory; - CUresult releaseExportableMemory(SSharedCUDAMemory* mem); - CUresult destroyExternalSemaphore(SExternalCUDASemaphore* sema); + CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + + friend class CCUDAHandler; + friend class CCUDASharedMemory; + friend class CCUDASharedSemaphore; struct SCUDACleaner : video::ICleanup { - core::smart_refctd_ptr resource; - SCUDACleaner(core::smart_refctd_ptr resource) + core::smart_refctd_ptr resource; + SCUDACleaner(core::smart_refctd_ptr resource) : resource(std::move(resource)) { } - ~SCUDACleaner() override - { - resource = nullptr; - } }; - - CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemGenericAllocationHandle memory); - friend class CCUDAHandler; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* 
const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); - ~CCUDADevice() = default; + + CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _handle, core::smart_refctd_ptr&& _handler); + ~CCUDADevice(); std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; IPhysicalDevice* const m_vulkanDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + core::smart_refctd_ptr m_handler; + CUdevice m_handle; + CUcontext m_context; + size_t m_allocationGranularity[4]; }; } #endif // _NBL_COMPILE_WITH_CUDA_ -#endif +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index b6f98385bb..dbad47877d 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -34,7 +34,7 @@ class CCUDAHandler : public core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } // - core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; @@ -174,9 +174,9 @@ class CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::future bytesRead; + system::IFile::success_t bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.get()); + source.resize(bytesRead.getBytesProcessed()); return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } @@ -243,8 +243,7 @@ class CCUDAHandler : public core::IReferenceCounted } core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); - - protected: +protected: CCUDAHandler(CUDA&& _cuda, NVRTC&& 
_nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) { @@ -256,7 +255,8 @@ class CCUDAHandler : public core::IReferenceCounted } } ~CCUDAHandler() = default; - + + // inline ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) { @@ -289,4 +289,4 @@ class CCUDAHandler : public core::IReferenceCounted #endif // _NBL_COMPILE_WITH_CUDA_ -#endif +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h new file mode 100644 index 0000000000..9b3e4a0551 --- /dev/null +++ b/include/nbl/video/CCUDASharedMemory.h @@ -0,0 +1,74 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ + + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class CCUDAMemoryMapping: public core::IReferenceCounted +{ +}; + +class CCUDASharedMemory : public core::IReferenceCounted +{ +public: + friend class CCUDADevice; + + CUdeviceptr getDeviceptr() const { return m_params.ptr; } + + struct SCreationParams + { + size_t size; + uint32_t alignment; + CUmemLocationType location; + }; + + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + CUdeviceptr ptr; + union + { + void* osHandle; + int fd; + }; + }; + + const SCreationParams& getCreationParams() const { return m_params; } + + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + core::smart_refctd_ptr exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; + +protected: + + CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) + : m_device(std::move(device)) + , m_params(std::move(params)) + {} + ~CCUDASharedMemory() override; + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h new file mode 100644 index 0000000000..882e794bd4 --- /dev/null +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class CCUDASharedSemaphore : public core::IReferenceCounted +{ +public: + friend class CCUDADevice; + + CUexternalSemaphore getInternalObject() const { return m_handle; } + +protected: + + CCUDASharedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, CUexternalSemaphore semaphore, void* osHandle) + : m_device(std::move(device)) + , m_src(std::move(m_src)) + , m_handle(semaphore) + , m_osHandle(osHandle) + {} + ~CCUDASharedSemaphore() override; + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalSemaphore m_handle; + void* m_osHandle; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif \ No newline at end of file diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 7365fa6339..7074f8861b 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -24,6 +24,8 @@ We only support persistently mapped buffers with ARB_buffer_storage. Please don't ask us to support Buffer Orphaning. */ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted { + friend class IDeviceMemoryAllocator; + friend class ILogicalDevice; public: //! Access flags for how the application plans to use mapped memory (if any) /** When you create the memory you can allow for it to be mapped (be given a pointer) @@ -68,6 +70,43 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted EMHF_MULTI_INSTANCE_BIT = 0x00000002, }; + //! 
Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D11_TEXTURE = 0x00000008, + EHT_D3D11_TEXTURE_KMT = 0x00000010, + EHT_D3D12_HEAP = 0x00000020, + EHT_D3D12_RESOURCE = 0x00000040, + EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, + }; + + /* ExternalMemoryProperties *//* provided by VK_KHR_external_memory_capabilities */ + struct SExternalMemoryProperties + { + uint32_t exportableTypes : 7 = ~0u; + uint32_t compatibleTypes : 7 = ~0u; + uint32_t dedicatedOnly : 1 = 0u; + uint32_t exportable : 1 = ~0u; + uint32_t importable : 1 = ~0u; + + bool operator == (SExternalMemoryProperties const& rhs) const = default; + + SExternalMemoryProperties operator &(SExternalMemoryProperties rhs) const + { + rhs.exportableTypes &= exportableTypes; + rhs.compatibleTypes &= compatibleTypes; + rhs.dedicatedOnly |= dedicatedOnly; + rhs.exportable &= exportable; + rhs.importable &= importable; + return rhs; + } + }; + + static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); + // const ILogicalDevice* getOriginDevice() const {return m_originDevice;} @@ -75,25 +114,25 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted E_API_TYPE getAPIType() const; //! Whether the allocation was made for a specific resource and is supposed to only be bound to that resource. - inline bool isDedicated() const {return m_dedicated;} + inline bool isDedicated() const {return m_params.dedicated;} //! Returns the size of the memory allocation - inline size_t getAllocationSize() const {return m_allocationSize;} + inline size_t getAllocationSize() const {return m_params.allocationSize;} //! - inline core::bitflag getAllocateFlags() const { return m_allocateFlags; } + inline core::bitflag getAllocateFlags() const { return m_params.allocateFlags; } //! 
- inline core::bitflag getMemoryPropertyFlags() const { return m_memoryPropertyFlags; } + inline core::bitflag getMemoryPropertyFlags() const { return m_params.memoryPropertyFlags; } //! Utility function, tells whether the allocation can be mapped (whether mapMemory will ever return anything other than nullptr) - inline bool isMappable() const {return m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)||m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} + inline bool isMappable() const {return m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)|| m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} //! Utility function, tell us if writes by the CPU or GPU need extra visibility operations to become visible for reading on the other processor /** Only execute flushes or invalidations if the allocation requires them, and batch them (flush one combined range instead of two or more) for greater efficiency. To execute a flush or invalidation, use IDriver::flushMappedAllocationRanges and IDriver::invalidateMappedAllocationRanges respectively. */ inline bool haveToMakeVisible() const { - return !m_memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); + return !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); } //! 
@@ -106,9 +145,9 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted { if (isCurrentlyMapped()) return false; - if(accessHint.hasFlags(EMCAF_READ) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) + if(accessHint.hasFlags(EMCAF_READ) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) return false; - if(accessHint.hasFlags(EMCAF_WRITE) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) + if(accessHint.hasFlags(EMCAF_WRITE) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) return false; m_mappedPtr = reinterpret_cast(map_impl(range,accessHint)); if (m_mappedPtr) @@ -149,23 +188,41 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Constant variant of getMappedPointer inline const void* getMappedPointer() const { return m_mappedPtr; } + struct SCreationParams + { + core::bitflag allocateFlags = E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE; + core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; + E_EXTERNAL_HANDLE_TYPE externalHandleType = E_EXTERNAL_HANDLE_TYPE::EHT_NONE; + void* externalHandle = nullptr; + const bool dedicated = false; + const size_t allocationSize; + }; + protected: - inline IDeviceMemoryAllocation( - const ILogicalDevice* const originDevice, const size_t _size, const core::bitflag allocateFlags, const core::bitflag memoryPropertyFlags, const bool dedicated - ) : m_originDevice(originDevice), m_allocationSize(_size), m_allocateFlags(allocateFlags), m_memoryPropertyFlags(memoryPropertyFlags), m_dedicated(dedicated) {} + inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) + { + m_postDestroyCleanup = std::move(cleanup); + } + + IDeviceMemoryAllocation( + const ILogicalDevice* originDevice, SCreationParams&& params = {}) + : m_originDevice(originDevice) + , m_mappedPtr(nullptr) + , m_mappedRange{ 0, 0 } + , m_currentMappingAccess(EMCAF_NO_MAPPING_ACCESS) + , m_params(std::move(params)) + {} virtual void* map_impl(const MemoryRange& 
range, const core::bitflag accessHint) = 0; virtual bool unmap_impl() = 0; - const ILogicalDevice* const m_originDevice; - const size_t m_allocationSize; - uint8_t* m_mappedPtr = nullptr; - MemoryRange m_mappedRange = {}; - core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; - const core::bitflag m_allocateFlags; - const core::bitflag m_memoryPropertyFlags; - const bool m_dedicated; + const ILogicalDevice* m_originDevice = nullptr; + uint8_t* m_mappedPtr; + MemoryRange m_mappedRange; + core::bitflag m_currentMappingAccess; + SCreationParams m_params; + std::unique_ptr m_postDestroyCleanup = nullptr; }; NBL_ENUM_ADD_BITWISE_OPERATORS(IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS) diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 0712ec24f6..408efd6da4 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -19,6 +19,12 @@ class IDeviceMemoryAllocator size_t memoryTypeIndex : 5 = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. + + // Handle Type for external resources + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE + //! Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE + void* externalHandle = nullptr; }; //! 
IMemoryTypeIterator extracts memoryType indices from memoryTypeBits in arbitrary order @@ -27,8 +33,15 @@ class IDeviceMemoryAllocator class IMemoryTypeIterator { public: - IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) - : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs) {} + IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + void* handle) + : m_allocateFlags(static_cast(allocateFlags.value)) + , m_reqs(reqs) + , m_handleType(handleType) + , m_handle(handle) + {} static inline uint32_t end() {return 32u;} @@ -40,11 +53,13 @@ class IDeviceMemoryAllocator inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) { - SAllocateInfo ret; + SAllocateInfo ret = {}; ret.size = m_reqs.size; ret.flags = m_allocateFlags; ret.memoryTypeIndex = dereference(); ret.dedication = dedication; + ret.externalHandleType = m_handleType; + ret.externalHandle = m_handle; return ret; } @@ -57,17 +72,24 @@ class IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t m_allocateFlags; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; + void* m_handle; }; //! 
DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB class DefaultMemoryTypeIterator : public IMemoryTypeIterator { public: - DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) : IMemoryTypeIterator(reqs, allocateFlags) + DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + void* handle) + : IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } + protected: uint32_t dereference() const override { @@ -100,19 +122,26 @@ class IDeviceMemoryAllocator }; virtual SAllocation allocate(const SAllocateInfo& info) = 0; - template - inline SAllocation allocate( - const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, IDeviceMemoryBacked* dedication=nullptr, - const core::bitflag allocateFlags=IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE) + template + SAllocation allocate( + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + IDeviceMemoryBacked* dedication = nullptr, + const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType = IDeviceMemoryAllocation::EHT_NONE, + void* handle = nullptr, + std::unique_ptr&& postDestroyCleanup = nullptr) { - for(memory_type_iterator_t memTypeIt(reqs, allocateFlags); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) + for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, handleType, handle); memTypeIt != IMemoryTypeIterator::end(); ++memTypeIt) { SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); - auto allocation = allocate(allocateInfo); + SAllocation allocation = allocate(allocateInfo); if (allocation.isValid()) + { + allocation.memory->setPostDestroyCleanup(std::move(postDestroyCleanup)); return allocation; 
+ } } - return {}; + return { }; } }; diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index d2ff049dfd..278e681a35 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -46,6 +46,8 @@ class IDeviceMemoryBacked : public IBackendObject // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects bool skipHandleDestroy = false; + core::bitflag externalHandleTypes = IDeviceMemoryAllocation::EHT_NONE; + //! If you specify queue family indices, then you're concurrent sharing inline bool isConcurrentSharing() const { @@ -125,6 +127,7 @@ class IDeviceMemoryBacked : public IBackendObject //! members SCachedCreationParams m_cachedCreationParams; SDeviceMemoryRequirements m_cachedMemoryReqs; + void* m_cachedExternalHandle = nullptr; }; } // end namespace nbl::video diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 278390939d..d4cdc6fd99 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -85,7 +85,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe bool supportsMask(const uint32_t queueFamilyIndex, core::bitflag accessMask) const; //! NOTE/TODO: this is not yet finished - inline bool validateMemoryBarrier(const uint32_t queueFamilyIndex, asset::SMemoryBarrier barrier) const; + bool validateMemoryBarrier(const uint32_t queueFamilyIndex, asset::SMemoryBarrier barrier) const; inline bool validateMemoryBarrier(const uint32_t queueFamilyIndex, const IGPUCommandBuffer::SOwnershipTransferBarrier& barrier, const bool concurrentSharing) const { // implicitly satisfied by our API: @@ -147,7 +147,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual IQueue::RESULT waitIdle() const = 0; //! 
Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) = 0; + virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&& ) = 0; // struct SSemaphoreWaitInfo { diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index dae0efe1bf..768fe1a66d 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -26,9 +26,48 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; + //! Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0x00000000, + EHT_OPAQUE_FD = 0x00000001, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D12_FENCE = 0x00000008, + EHT_SYNC_FD = 0x00000010, + }; + + //! + struct SCreationParams + { + // A Pre-Destroy-Step is called out just before a `vkDestory` or `glDelete`, this is only useful for "imported" resources + std::unique_ptr preDestroyCleanup = nullptr; + // A Post-Destroy-Step is called in this class' destructor, this is only useful for "imported" resources + std::unique_ptr postDestroyCleanup = nullptr; + // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects + bool skipHandleDestroy = false; + // Handle Type for external resources + core::bitflag externalHandleTypes = EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE + //! 
Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE + void* externalHandle = nullptr; + + uint64_t initialValue = 0; + }; + + auto const& getCreationParams() const + { + return m_creationParams; + } + protected: - inline ISemaphore(core::smart_refctd_ptr&& dev) : IBackendObject(std::move(dev)) {} + ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& params = {}) + : IBackendObject(std::move(dev)) + , m_creationParams(std::move(params)) + {} virtual ~ISemaphore() = default; + + SCreationParams m_creationParams; }; } diff --git a/include/nbl/video/SPhysicalDeviceLimits.h b/include/nbl/video/SPhysicalDeviceLimits.h index fe263aed84..7f58a67443 100644 --- a/include/nbl/video/SPhysicalDeviceLimits.h +++ b/include/nbl/video/SPhysicalDeviceLimits.h @@ -552,6 +552,9 @@ struct SPhysicalDeviceLimits /* CooperativeMatrixPropertiesKHR *//* VK_KHR_cooperative_matrix */ core::bitflag cooperativeMatrixSupportedStages = asset::IShader::ESS_UNKNOWN; + bool externalFenceWin32 = false; /* VK_KHR_external_fence_win32 */ // [TODO] requires instance extensions, add them + bool externalMemoryWin32 = false; /* VK_KHR_external_memory_win32 */ // [TODO] requires instance extensions, add them + bool externalSemaphoreWin32 = false; /* VK_KHR_external_semaphore_win32 */ // [TODO] requires instance extensions, add them /* Always enabled if available, reported as limits */ diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index ac0aa0c42d..dc9f4e7bef 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -119,7 +119,6 @@ nbl_get_conf_dir(NABLA_CONF_DIR_RELEASE Release) nbl_get_conf_dir(NABLA_CONF_DIR_RELWITHDEBINFO RelWithDebInfo) if (NBL_COMPILE_WITH_CUDA) - message(STATUS "Building with CUDA interop") set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) if (NBL_BUILD_OPTIX) set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) @@ -320,6 +319,8 @@ set(NBL_VIDEO_SOURCES # CUDA ${NBL_ROOT_PATH}/src/nbl/video/CCUDAHandler.cpp 
${NBL_ROOT_PATH}/src/nbl/video/CCUDADevice.cpp + ${NBL_ROOT_PATH}/src/nbl/video/CCUDASharedMemory.cpp + ${NBL_ROOT_PATH}/src/nbl/video/CCUDASharedSemaphore.cpp ) set(NBL_SCENE_SOURCES @@ -384,6 +385,10 @@ endif() target_compile_definitions(Nabla PRIVATE __NBL_BUILDING_NABLA__) +if (NBL_COMPILE_WITH_CUDA) + target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) +endif() + if (ANDROID) add_library(android_native_app_glue STATIC ${ANDROID_NDK_ROOT_PATH}/sources/android/native_app_glue/android_native_app_glue.c diff --git a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp index 4ad0710dbf..6eb93d7242 100644 --- a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp @@ -96,6 +96,7 @@ void CPLYMeshFileLoader::initialize() auto pipelineBundle = defaultOverride.findCachedAsset(pipelineCacheHash, types, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); if (pipelineBundle.getContents().empty()) { +#if 0 // WHAT IS THIS? 
auto mbVertexShader = core::smart_refctd_ptr(); auto mbFragmentShader = core::smart_refctd_ptr(); { @@ -108,6 +109,7 @@ void CPLYMeshFileLoader::initialize() mbVertexShader = core::smart_refctd_ptr_static_cast(vertexShaderBundle->begin()->getContents().begin()[0]); mbFragmentShader = core::smart_refctd_ptr_static_cast(fragmentShaderBundle->begin()->getContents().begin()[0]); } +#endif auto mbPipelineLayout = defaultOverride.findDefaultAsset("nbl/builtin/pipeline_layout/loader/PLY", fakeContext, 0u).first; @@ -130,7 +132,7 @@ void CPLYMeshFileLoader::initialize() const auto currentBitmask = core::createBitmask({ attrib }); inputParams.enabledBindingFlags |= currentBitmask; inputParams.enabledAttribFlags |= currentBitmask; - inputParams.bindings[attrib] = { asset::getTexelOrBlockBytesize(static_cast(vertexAttribParamsAllOptions[attrib].format)), EVIR_PER_VERTEX }; + inputParams.bindings[attrib] = { asset::getTexelOrBlockBytesize(static_cast(vertexAttribParamsAllOptions[attrib].format)), SVertexInputBindingParams::EVIR_PER_VERTEX}; inputParams.attributes[attrib] = vertexAttribParamsAllOptions[attrib]; } @@ -143,14 +145,15 @@ void CPLYMeshFileLoader::initialize() SRasterizationParams rastarizationParmas; +#if 0 // WHAT IS THIS? 
auto mbPipeline = core::make_smart_refctd_ptr(std::move(mbPipelineLayout), nullptr, nullptr, inputParams, blendParams, primitiveAssemblyParams, rastarizationParmas); { mbPipeline->setShaderAtStage(asset::IShader::ESS_VERTEX, mbVertexShader.get()); mbPipeline->setShaderAtStage(asset::IShader::ESS_FRAGMENT, mbFragmentShader.get()); - asset::SAssetBundle newPipelineBundle(nullptr, { core::smart_refctd_ptr(mbPipeline) }); defaultOverride.insertAssetIntoCache(newPipelineBundle, pipelineCacheHash, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); } +#endif } else return; diff --git a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp index c080857c63..b507153916 100644 --- a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp @@ -52,6 +52,7 @@ void CSTLMeshFileLoader::initialize() auto pipelineBundle = defaultOverride.findCachedAsset(pipelineCacheHash, types, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); if (pipelineBundle.getContents().empty()) { +#if 0 // WHAT IS THIS? 
auto mbVertexShader = core::smart_refctd_ptr(); auto mbFragmentShader = core::smart_refctd_ptr(); { @@ -64,7 +65,7 @@ void CSTLMeshFileLoader::initialize() mbVertexShader = core::smart_refctd_ptr_static_cast(vertexShaderBundle->begin()->getContents().begin()[0]); mbFragmentShader = core::smart_refctd_ptr_static_cast(fragmentShaderBundle->begin()->getContents().begin()[0]); } - +#endif auto defaultOverride = IAssetLoaderOverride(m_assetMgr); const IAssetLoader::SAssetLoadContext fakeContext(IAssetLoader::SAssetLoadParams{}, nullptr); @@ -79,7 +80,7 @@ void CSTLMeshFileLoader::initialize() const auto stride = positionFormatByteSize + colorFormatByteSize + normalFormatByteSize; mbInputParams.enabledBindingFlags |= core::createBitmask({ 0 }); mbInputParams.enabledAttribFlags |= core::createBitmask({ POSITION_ATTRIBUTE, NORMAL_ATTRIBUTE, withColorAttribute ? COLOR_ATTRIBUTE : 0 }); - mbInputParams.bindings[0] = { stride, EVIR_PER_VERTEX }; + mbInputParams.bindings[0] = { stride, SVertexInputBindingParams::EVIR_PER_VERTEX }; mbInputParams.attributes[POSITION_ATTRIBUTE].format = EF_R32G32B32_SFLOAT; mbInputParams.attributes[POSITION_ATTRIBUTE].relativeOffset = 0; @@ -102,14 +103,15 @@ void CSTLMeshFileLoader::initialize() SRasterizationParams rastarizationParmas; +#if 0 // WHAT IS THIS? 
auto mbPipeline = core::make_smart_refctd_ptr(std::move(mbPipelineLayout), nullptr, nullptr, mbInputParams, blendParams, primitiveAssemblyParams, rastarizationParmas); { mbPipeline->setShaderAtStage(asset::IShader::ESS_VERTEX, mbVertexShader.get()); mbPipeline->setShaderAtStage(asset::IShader::ESS_FRAGMENT, mbFragmentShader.get()); } - asset::SAssetBundle newPipelineBundle(nullptr, {core::smart_refctd_ptr(mbPipeline)}); defaultOverride.insertAssetIntoCache(newPipelineBundle, pipelineCacheHash, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); +#endif } else return; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index c83fb562ba..39faaaa0ed 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -1,40 +1,68 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CCUDAHandler.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { -CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture) - : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture) +CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _handle, core::smart_refctd_ptr&& _handler) + : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture), m_handle(_handle), m_handler(std::move(_handler)), m_allocationGranularity{} { m_defaultCompileOptions.push_back("--std=c++14"); 
m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); + auto& cu = m_handler->getCUDAFunctionTable(); + + CUresult re = cu.pcuCtxCreate_v2(&m_context, 0, m_handle); + assert(CUDA_SUCCESS == re); + re = cu.pcuCtxSetCurrent(m_context); + assert(CUDA_SUCCESS == re); + + for (uint32_t i = 0; i < ARRAYSIZE(m_allocationGranularity); ++i) + { + uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = {.type = static_cast(i), .id = m_handle }, + .win32HandleMetaData = metaData, + }; + auto re = cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[i], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + + assert(CUDA_SUCCESS == re); + } + +} + +CCUDADevice::~CCUDADevice() +{ + m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); } +size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const +{ + return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; +} -CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemGenericAllocationHandle memory) +CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { auto& cu = m_handler->getCUDAFunctionTable(); CUdeviceptr ptr = 0; if (auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) - { return err; - } if (auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { cu.pcuMemAddressFree(ptr, size); return err; } - + CUmemAccessDesc accessDesc = { - .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = m_handle }, + .location = { .type = location, .id = m_handle }, .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, }; @@ -50,173 +78,70 
@@ CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } -CUresult CCUDADevice::releaseExportableMemory(SSharedCUDAMemory* mem) -{ - auto& cu = m_handler->getCUDAFunctionTable(); - if (auto err = cu.pcuMemUnmap(mem->ptr, mem->size); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemAddressFree(mem->ptr, mem->size); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemRelease(mem->memory); CUDA_SUCCESS != err) return err; - CloseHandle(mem->osHandle); - return CUDA_SUCCESS; -} - -CUresult CCUDADevice::destroyExternalSemaphore(SExternalCUDASemaphore* sema) -{ - auto& cu = m_handler->getCUDAFunctionTable(); - if (auto err = cu.pcuDestroyExternalSemaphore(sema->semaphore); CUDA_SUCCESS != err) return err; - CloseHandle(sema->osHandle); - return CUDA_SUCCESS; -} - -CUresult CCUDADevice::createExportableMemory(core::smart_refctd_ptr* outMem, size_t size, size_t alignment) +CUresult CCUDADevice::createSharedMemory( + core::smart_refctd_ptr* outMem, + CCUDASharedMemory::SCreationParams&& inParams) { if (!outMem) return CUDA_ERROR_INVALID_VALUE; + CCUDASharedMemory::SCachedCreationParams params = { inParams }; + auto& cu = m_handler->getCUDAFunctionTable(); uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = CU_MEM_HANDLE_TYPE_WIN32, - .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = m_handle }, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = params.location, .id = m_handle }, .win32HandleMetaData = metaData, }; - - size_t granularity = 0; - if (auto err = cu.pcuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); CUDA_SUCCESS != err) - return err; - - size = ((size - 1) / granularity + 1) * granularity; - CUmemGenericAllocationHandle mem = 0; - void* handle = 0; - CUdeviceptr ptr = 0; + params.granularSize = roundToGranularity(params.location, params.size); - if(auto err 
= cu.pcuMemCreate(&mem, size, &prop, 0); CUDA_SUCCESS != err) + CUmemGenericAllocationHandle mem; + if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) return err; - - if (auto err = cu.pcuMemExportToShareableHandle(&handle, mem, CU_MEM_HANDLE_TYPE_WIN32, 0); CUDA_SUCCESS != err) + + if (auto err = cu.pcuMemExportToShareableHandle(¶ms.osHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { cu.pcuMemRelease(mem); return err; } - if (auto err = reserveAdrressAndMapMemory(&ptr, size, alignment, mem); CUDA_SUCCESS != err) + if (auto err = reserveAdrressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { - CloseHandle(handle); + CloseHandle(params.osHandle); cu.pcuMemRelease(mem); return err; } - *outMem = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), size, ptr, mem, handle); - - return CUDA_SUCCESS; -} - -CCUDADevice::SSharedCUDAMemory::~SSharedCUDAMemory() -{ - device->releaseExportableMemory(this); -} - -CCUDADevice::SExternalCUDASemaphore::~SExternalCUDASemaphore() -{ - device->destroyExternalSemaphore(this); -} - -core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemory* mem, ILogicalDevice* device) -{ - - if (!device || !mem || !mem->memory || !mem->osHandle || !mem->ptr || !mem->size) - return nullptr; - + if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - CUuuid id; - // TODO(Atil): Cache properties - if (CUDA_SUCCESS != m_handler->getCUDAFunctionTable().pcuDeviceGetUuid(&id, m_handle)) - return nullptr; - - if (memcmp(&id, device->getPhysicalDevice()->getProperties().deviceUUID, 16)) - return nullptr; - } - - auto buf = device->createBuffer(IGPUBuffer::SCreationParams { - asset::IBuffer::SCreationParams{ - .size = mem->size, - .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT - }, - IDeviceMemoryBacked::SCreationParams{ - 
IDeviceMemoryBacked::SCachedCreationParams{ - .externalHandleType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, - .externalHandle = mem->osHandle - } - }}); - - auto req = buf->getMemoryReqs(); - req.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto allocation = device->allocate(req, buf.get()); - - if (!(allocation.memory && allocation.offset != ILogicalDevice::InvalidMemoryOffset)) - return nullptr; - - buf->chainPreDestroyCleanup(std::make_unique(core::smart_refctd_ptr(mem))); - return buf; -} - -CUresult CCUDADevice::importGPUBuffer(core::smart_refctd_ptr* outPtr, IGPUBuffer* buf) -{ - if (!buf || !outPtr) - return CUDA_ERROR_INVALID_VALUE; - - auto& params = buf->getCachedCreationParams(); - - if (!params.externalHandleType.value) - return CUDA_ERROR_INVALID_VALUE; - - CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { - .type = static_cast(params.externalHandleType.value), - .handle = {.win32 = {.handle = buf->getExternalHandle()}}, - .size = buf->getMemoryReqs().size, - }; - - CUmemGenericAllocationHandle mem = 0; - CUdeviceptr ptr = 0; - void* handle = handleDesc.handle.win32.handle; - - auto& cu = m_handler->getCUDAFunctionTable(); - if (auto err = cu.pcuMemImportFromShareableHandle(&mem, buf->getExternalHandle(), - static_cast(params.externalHandleType.value)); - CUDA_SUCCESS != err) - return err; - - if(auto err = reserveAdrressAndMapMemory(&ptr, buf->getSize(), 1u << buf->getMemoryReqs().alignmentLog2, mem)) - { - cu.pcuMemRelease(mem); + CloseHandle(params.osHandle); return err; } - - *outPtr = core::make_smart_refctd_ptr( - core::smart_refctd_ptr(this), - buf->getSize(), ptr, mem, handle); - buf->chainPreDestroyCleanup(std::make_unique(*outPtr)); + *outMem = core::smart_refctd_ptr(new CCUDASharedMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); + return CUDA_SUCCESS; } -CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, IGPUSemaphore* sema) +CUresult 
CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) { if (!sema || !outPtr) return CUDA_ERROR_INVALID_VALUE; auto& cu = m_handler->getCUDAFunctionTable(); - auto handleType = sema->getCreationParams().externalHandleType.value; + auto handleType = sema->getCreationParams().externalHandleTypes; auto handle = sema->getCreationParams().externalHandle; - if (!handleType || !handle) + if (!handleType.hasFlags(ISemaphore::EHT_OPAQUE_WIN32) || !handle) return CUDA_ERROR_INVALID_VALUE; CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { - .type = static_cast(handleType), + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, .handle = {.win32 = {.handle = handle }}, }; @@ -224,120 +149,11 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(core::smart_refctd_ptr(this), cusema, handle); - sema->chainPreDestroyCleanup(std::make_unique(*outPtr)); - return CUDA_SUCCESS; -} - -#if 0 -CUresult CCUDAHandler::registerBuffer(GraphicsAPIObjLink* link, uint32_t flags) -{ - assert(link->obj); - auto glbuf = static_cast(link->obj.get()); - auto retval = cuda.pcuGraphicsGLRegisterBuffer(&link->cudaHandle,glbuf->getOpenGLName(),flags); - if (retval!=CUDA_SUCCESS) - link->obj = nullptr; - return retval; -} -CUresult CCUDAHandler::registerImage(GraphicsAPIObjLink* link, uint32_t flags) -{ - assert(link->obj); - - auto format = link->obj->getCreationParameters().format; - if (asset::isBlockCompressionFormat(format) || asset::isDepthOrStencilFormat(format) || asset::isScaledFormat(format) || asset::isPlanarFormat(format)) - return CUDA_ERROR_INVALID_IMAGE; - - auto glimg = static_cast(link->obj.get()); - GLenum target = glimg->getOpenGLTarget(); - switch (target) - { - case GL_TEXTURE_2D: - case GL_TEXTURE_2D_ARRAY: - case GL_TEXTURE_CUBE_MAP: - case GL_TEXTURE_3D: - break; - default: - return CUDA_ERROR_INVALID_IMAGE; - break; - } - auto retval = cuda.pcuGraphicsGLRegisterImage(&link->cudaHandle,glimg->getOpenGLName(),target,flags); - if 
(retval != CUDA_SUCCESS) - link->obj = nullptr; - return retval; -} - - -constexpr auto MaxAquireOps = 4096u; - -CUresult CCUDAHandler::acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedPointer_v2(&iit->asBuffer.pointer,outbufferSizes ? sit:&tmp,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -CUresult CCUDAHandler::acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedMipmappedArray(&iit->asImage.mipmappedArray,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; - } + *outPtr = core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, handle), core::dont_grab); return CUDA_SUCCESS; } -CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsSubResourceGetMappedArray(&iit->asImage.array,iit->cudaHandle,*ait,*mit); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -#endif } -#endif // _NBL_COMPILE_WITH_CUDA_ +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 183afe6b43..09c2fbe14e 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -3,6 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDAHandler.h" +#include 
"nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "jitify/jitify.hpp" @@ -410,7 +411,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste NVRTC nvrtc = {}; #if defined(_NBL_WINDOWS_API_) // Perpetual TODO: any new CUDA releases we need to account for? - const char* nvrtc64_versions[] = { "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; + const char* nvrtc64_versions[] = { "nvrtc64_120", "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; for (auto verpath=nvrtc64_versions; *verpath; verpath++) { @@ -468,7 +469,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste { const void* contents = it.second.data(); headers.push_back(core::make_smart_refctd_ptr>( - core::smart_refctd_ptr(system),it.first.c_str(), + it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, const_cast(contents),it.second.size()+1u )); @@ -514,7 +515,8 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; auto ptx = core::make_smart_refctd_ptr(_size); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,reinterpret_cast(ptx->getPointer()))}; + nvrtcResult result = m_nvrtc.pnvrtcGetPTX(prog, reinterpret_cast(ptx->getPointer())); + return {std::move(ptx),result}; } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) @@ -538,7 +540,8 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct CUuuid uuid = {}; if (m_cuda.pcuDeviceGetUuid(&uuid,handle)!=CUDA_SUCCESS) continue; - if (!memcmp(&uuid,&physicalDevice->getLimits().deviceUUID,VK_UUID_SIZE)) + + if 
(!memcmp(&uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; for (int i=0; i CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); + auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,handle,core::smart_refctd_ptr(this)); return core::smart_refctd_ptr(device,core::dont_grab); } } @@ -633,4 +636,4 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct } -#endif // _NBL_COMPILE_WITH_CUDA_ +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp new file mode 100644 index 0000000000..3ebb8e211d --- /dev/null +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ + +core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const +{ + IDeviceMemoryAllocator::SAllocateInfo info = { + .size = m_params.granularSize, + .externalHandleType = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + .externalHandle = m_params.osHandle, + }; + + auto pd = device->getPhysicalDevice(); + uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; + uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); + + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; + case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; + // TODO(Atil): Figure out how to handle these + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + default: break; + } + + 
IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; + req.size = m_params.granularSize; + req.memoryTypeBits = memoryTypeBits; + req.prefersDedicatedAllocation = nullptr != dedication; + req.requiresDedicatedAllocation = nullptr != dedication; + + return device->allocate(req, + dedication, + IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, m_params.osHandle, + std::make_unique(core::smart_refctd_ptr(this))).memory; +} + +#if 0 +core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDevice* device, core::bitflag usage) const +{ + if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) + return nullptr; + + auto buf = device->createBuffer({{ + .size = m_params.granularSize, + .usage = usage }, {{ + .postDestroyCleanup = std::make_unique(core::smart_refctd_ptr(this)), + .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + .externalHandle = m_params.osHandle + }}}); + + auto req = buf->getMemoryReqs(); + auto pd = device->getPhysicalDevice(); + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_DEVICE: req.memoryTypeBits &= pd->getDeviceLocalMemoryTypeBits(); break; + case CU_MEM_LOCATION_TYPE_HOST: req.memoryTypeBits &= pd->getHostVisibleMemoryTypeBits(); break; + // TODO(Atil): Figure out how to handle these + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + default: break; + } + + if (!device->allocate(req, buf.get()).isValid()) + return nullptr; + + return buf; +} + +#endif + +core::smart_refctd_ptr CCUDASharedMemory::exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const +{ + if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) + return nullptr; + + auto img = device->createImage({ + std::move(params), {{ .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE }}, + IGPUImage::TILING::LINEAR, + 1 /*preinitialized*/, + }); + + if (exportAsMemory(device, 
img.get())) + return img; + + return nullptr; +} + +CCUDASharedMemory::~CCUDASharedMemory() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + + CUresult re[] = { + cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), + }; + CloseHandle(m_params.osHandle); + +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDASharedSemaphore.cpp b/src/nbl/video/CCUDASharedSemaphore.cpp new file mode 100644 index 0000000000..4d6d3aacc9 --- /dev/null +++ b/src/nbl/video/CCUDASharedSemaphore.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ +CCUDASharedSemaphore::~CCUDASharedSemaphore() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + cu.pcuDestroyExternalSemaphore(m_handle); + CloseHandle(m_osHandle); +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index 55e4c90dab..ed2e3e0fab 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -210,7 +210,7 @@ bool CVulkanCommandBuffer::pipelineBarrier_impl(const core::bitflaggetQueueFamilyIndex()); info.dependencyFlags = static_cast(dependencyFlags.value); - getFunctionTable().vkCmdPipelineBarrier2KHR(m_cmdbuf,&info); + getFunctionTable().vkCmdPipelineBarrier2(m_cmdbuf,&info); return true; } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 7041e4bad7..752645f633 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -45,21 +45,39 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t 
initialValue) +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaphore::SCreationParams&& params) { + VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; + VkExportSemaphoreWin32HandleInfoKHR handleInfo = { .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, .dwAccess = GENERIC_ALL }; + VkExportSemaphoreCreateInfo exportInfo = { VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, &handleInfo, static_cast(params.externalHandleTypes.value) }; + VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; - type.pNext = nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR + type.pNext = params.externalHandleTypes.value ? &exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; - type.initialValue = initialValue; + type.initialValue = params.initialValue; - VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,&type }; + VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, &type }; createInfo.flags = static_cast(0); // flags must be 0 VkSemaphore semaphore; - if (m_devf.vk.vkCreateSemaphore(m_vkdev,&createInfo,nullptr,&semaphore)==VK_SUCCESS) - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this),semaphore); - else + if (VK_SUCCESS != m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore)) return nullptr; + + if (params.externalHandleTypes.value) + { + VkSemaphoreGetWin32HandleInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = semaphore, + .handleType = 
static_cast(params.externalHandleTypes.value), + }; + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, ¶ms.externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); + return nullptr; + } + } + + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), semaphore, std::move(params)); } auto CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) -> WAIT_RESULT { @@ -138,11 +156,32 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca vk_allocateFlagsInfo.deviceMask = 0u; // unused: for now } VkMemoryDedicatedAllocateInfo vk_dedicatedInfo = {VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, nullptr}; + VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo }; + vk_allocateInfo.allocationSize = info.size; + vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; + + VkImportMemoryWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .handle = info.externalHandle + }; + + const void** pNext = &vk_allocateFlagsInfo.pNext; + + if (info.externalHandleType) + { + // Importing + *pNext = &importInfo; + pNext = &importInfo.pNext; + } + if(info.dedication) { // VK_KHR_dedicated_allocation is in core 1.1, no querying for support needed static_assert(MinimumVulkanApiVersion >= VK_MAKE_API_VERSION(0,1,1,0)); - vk_allocateFlagsInfo.pNext = &vk_dedicatedInfo; + *pNext = &vk_dedicatedInfo; + pNext = &vk_dedicatedInfo.pNext; + switch (info.dedication->getObjectType()) { case IDeviceMemoryBacked::EOT_BUFFER: @@ -157,9 +196,6 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca break; } } - VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo}; - vk_allocateInfo.allocationSize = info.size; - vk_allocateInfo.memoryTypeIndex = 
info.memoryTypeIndex; VkDeviceMemory vk_deviceMemory; auto vk_res = m_devf.vk.vkAllocateMemory(m_vkdev, &vk_allocateInfo, nullptr, &vk_deviceMemory); @@ -168,7 +204,17 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; - ret.memory = core::make_smart_refctd_ptr(this,info.size,allocateFlags,memoryPropertyFlags,info.dedication,vk_deviceMemory); + + CVulkanMemoryAllocation::SCreationParams params = { + .allocateFlags = allocateFlags, + .memoryPropertyFlags = memoryPropertyFlags, + .externalHandleType = info.externalHandleType, + .externalHandle = info.externalHandle, + .dedicated = !!info.dedication, + .allocationSize = info.size, + }; + + ret.memory = core::make_smart_refctd_ptr(this,vk_deviceMemory, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) { diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index b83a8cc803..d8f934ceb9 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -52,7 +52,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice return CVulkanQueue::getResultFrom(m_devf.vk.vkDeviceWaitIdle(m_vkdev)); } - core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) override; + core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) override; WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 5a4dfd5ff5..fb214c897e 100644 --- 
a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -4,11 +4,15 @@ namespace nbl::video { CVulkanMemoryAllocation::CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle -) : IDeviceMemoryAllocation(dev,size,flags,memoryPropertyFlags,isDedicated), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + SCreationParams&& params +) + : IDeviceMemoryAllocation(dev,std::move(params)) + , m_vulkanDevice(dev) + , m_deviceMemoryHandle(deviceMemoryHandle) +{ +} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 470e914ae3..d9508411b0 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -15,10 +15,9 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation { public: CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 0f771a41ae..e457ae3a2b 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1201,6 +1201,9 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart if (isExtensionSupported(VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME)) properties.limits.cooperativeMatrixRobustness = cooperativeMatrixFeatures.robustness; #endif + 
properties.limits.externalFenceWin32 = isExtensionSupported(VK_KHR_EXTERNAL_FENCE_WIN32_EXTENSION_NAME); + properties.limits.externalMemoryWin32 = isExtensionSupported(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); + properties.limits.externalSemaphoreWin32 = isExtensionSupported(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); } // we compare all limits against the defaults easily! diff --git a/src/nbl/video/CVulkanQueue.cpp b/src/nbl/video/CVulkanQueue.cpp index 2dd76a47a5..f94a4a8d7d 100644 --- a/src/nbl/video/CVulkanQueue.cpp +++ b/src/nbl/video/CVulkanQueue.cpp @@ -84,7 +84,7 @@ auto CVulkanQueue::submit_impl(const std::span _submi outSubmitInfo->signalSemaphoreInfoCount = fillSemaphoreInfo(submit.signalSemaphores,outSignalSemaphoreInfo); outSubmitInfo++; } - const auto vk_result = static_cast(m_originDevice)->getFunctionTable()->vk.vkQueueSubmit2KHR(m_vkQueue,submits.size(),submits.data(),VK_NULL_HANDLE); + const auto vk_result = static_cast(m_originDevice)->getFunctionTable()->vk.vkQueueSubmit2(m_vkQueue,submits.size(),submits.data(),VK_NULL_HANDLE); return getResultFrom(vk_result); } diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 9290110d8d..2beb7cb21b 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,8 +15,11 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, const VkSemaphore semaphore) - : ISemaphore(std::move(_vkdev)), m_semaphore(semaphore) {} + inline CVulkanSemaphore(core::smart_refctd_ptr&& dev, const VkSemaphore semaphore, SCreationParams&& params = {}) + : ISemaphore(std::move(dev), std::move(params)) + , m_semaphore(semaphore) + {} + ~CVulkanSemaphore(); uint64_t getCounterValue() const override; diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 4ddb828f39..de6bc6b880 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ 
b/src/nbl/video/IGPUCommandBuffer.cpp @@ -164,7 +164,7 @@ bool IGPUCommandBuffer::reset(const core::bitflag flags) bool IGPUCommandBuffer::end() { const bool whollyInsideRenderpass = m_recordingFlags.hasFlags(USAGE::RENDER_PASS_CONTINUE_BIT); - if (!checkStateBeforeRecording(whollyInsideRenderpass ? queue_flags_t::GRAPHICS_BIT:queue_flags_t::NONE,whollyInsideRenderpass ? RENDERPASS_SCOPE::INSIDE:RENDERPASS_SCOPE::OUTSIDE)) + if (!checkStateBeforeRecording(whollyInsideRenderpass ? queue_flags_t::GRAPHICS_BIT:~queue_flags_t::NONE,whollyInsideRenderpass ? RENDERPASS_SCOPE::INSIDE:RENDERPASS_SCOPE::OUTSIDE)) return false; m_state = STATE::EXECUTABLE; @@ -302,7 +302,7 @@ bool IGPUCommandBuffer::waitEvents(const uint32_t eventCount, IEvent* const* con bool IGPUCommandBuffer::pipelineBarrier(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) { - if (!checkStateBeforeRecording(/*everything is allowed*/)) + if (!checkStateBeforeRecording(~queue_flags_t::NONE)) return false; if (depInfo.memBarrierCount==0u && depInfo.bufBarrierCount==0u && depInfo.imgBarrierCount==0u) From bd32f3617632811af2028abf273bf59bcdb6d060 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 4 Jan 2024 18:45:34 +0300 Subject: [PATCH 09/62] point jitify to the right hash --- 3rdparty/jitify | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/jitify b/3rdparty/jitify index 0d6dbd8ccd..1a0ca0e837 160000 --- a/3rdparty/jitify +++ b/3rdparty/jitify @@ -1 +1 @@ -Subproject commit 0d6dbd8ccd07e6bfc811d363a54912dfc6d4799a +Subproject commit 1a0ca0e837405506f3b8f7883bacb71c20d86d96 From b1c5a46ba6e340945cf8e52215aed54dd1418ae8 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Fri, 5 Jan 2024 01:02:16 +0300 Subject: [PATCH 10/62] update examples && use non KHR version of vk functions --- examples_tests | 2 +- src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples_tests b/examples_tests index 
6ce21d5c5c..4159025751 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 6ce21d5c5c8026b6772f3e60e21096ee54353a81 +Subproject commit 415902575143a28cba08d677c73f1e917f3367cc diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 752645f633..002dad3ae7 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -99,7 +99,7 @@ auto CVulkanLogicalDevice::waitForSemaphores(const std::span Date: Thu, 4 Jan 2024 12:03:52 +0100 Subject: [PATCH 11/62] correct bad validations, KHR instead of coe func usage etc. --- src/nbl/video/CVulkanCommandBuffer.cpp | 2 +- src/nbl/video/CVulkanQueue.cpp | 2 +- src/nbl/video/IGPUCommandBuffer.cpp | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index ed2e3e0fab..af090c92c3 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -431,7 +431,7 @@ bool CVulkanCommandBuffer::bindDescriptorSets_impl(const asset::E_PIPELINE_BIND_ } } // with K slots you need at most (K+1)/2 calls - assert(bindCallsCount < (IGPUPipelineLayout::DESCRIPTOR_SET_COUNT-1)/2); + assert(bindCallsCount <= (IGPUPipelineLayout::DESCRIPTOR_SET_COUNT+1)/2); return true; } diff --git a/src/nbl/video/CVulkanQueue.cpp b/src/nbl/video/CVulkanQueue.cpp index f94a4a8d7d..65c85239a5 100644 --- a/src/nbl/video/CVulkanQueue.cpp +++ b/src/nbl/video/CVulkanQueue.cpp @@ -60,7 +60,7 @@ auto CVulkanQueue::submit_impl(const std::span _submi core::vector submits(_submits.size(),{VK_STRUCTURE_TYPE_SUBMIT_INFO_2_KHR,/*No interesting extensions*/nullptr,/*No protected stuff yet*/0}); core::vector waitSemaphores(waitSemCnt); core::vector commandBuffers(cmdBufCnt,{VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO_KHR,nullptr}); - core::vector signalSemaphores(waitSemCnt); + core::vector signalSemaphores(signalSemCnt); auto outSubmitInfo = 
submits.data(); auto outWaitSemaphoreInfo = waitSemaphores.data(); diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index de6bc6b880..2ede1f2c0f 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -164,7 +164,10 @@ bool IGPUCommandBuffer::reset(const core::bitflag flags) bool IGPUCommandBuffer::end() { const bool whollyInsideRenderpass = m_recordingFlags.hasFlags(USAGE::RENDER_PASS_CONTINUE_BIT); - if (!checkStateBeforeRecording(whollyInsideRenderpass ? queue_flags_t::GRAPHICS_BIT:~queue_flags_t::NONE,whollyInsideRenderpass ? RENDERPASS_SCOPE::INSIDE:RENDERPASS_SCOPE::OUTSIDE)) + auto allowedQueueCaps = queue_flags_t::GRAPHICS_BIT; + if (!whollyInsideRenderpass) + allowedQueueCaps |= queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT; + if (!checkStateBeforeRecording(allowedQueueCaps,whollyInsideRenderpass ? RENDERPASS_SCOPE::INSIDE:RENDERPASS_SCOPE::OUTSIDE)) return false; m_state = STATE::EXECUTABLE; From 725a984ecb2ae675ac04ff9153fa68860e8938fb Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 4 Jan 2024 12:09:55 +0100 Subject: [PATCH 12/62] revert a dangerous api change --- include/nbl/video/IDeviceMemoryAllocation.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 7074f8861b..64529858ec 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -141,14 +141,14 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted size_t offset = 0ull; size_t length = 0ull; }; - inline bool map(const MemoryRange& range, const core::bitflag accessHint=IDeviceMemoryAllocation::EMCAF_READ_AND_WRITE) + inline void* map(const MemoryRange& range, const core::bitflag accessHint=IDeviceMemoryAllocation::EMCAF_READ_AND_WRITE) { if (isCurrentlyMapped()) - return false; + return nullptr; 
if(accessHint.hasFlags(EMCAF_READ) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) - return false; + return nullptr; if(accessHint.hasFlags(EMCAF_WRITE) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) - return false; + return nullptr; m_mappedPtr = reinterpret_cast(map_impl(range,accessHint)); if (m_mappedPtr) m_mappedPtr -= range.offset; From d2c9382f56af5f7b44541d5f2ab229fcdde0ad49 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 4 Jan 2024 21:21:42 +0100 Subject: [PATCH 13/62] update examples_tests --- 3rdparty/dxc/dxc | 2 +- examples_tests | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 024c8a9a34..79bf3aa07d 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 024c8a9a349dc45f5b4818c413502e0a45f5d542 +Subproject commit 79bf3aa07d0e603aded9d93b23bf5930d75dd539 diff --git a/examples_tests b/examples_tests index 4159025751..138356a4a5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 415902575143a28cba08d677c73f1e917f3367cc +Subproject commit 138356a4a5e277859c9d156967c0187e45ca8e49 From 2d24604ba86af24a38c65d69ed4d05fef12f2519 Mon Sep 17 00:00:00 2001 From: Przemek Date: Fri, 5 Jan 2024 17:08:16 +0100 Subject: [PATCH 14/62] Disabled CSPIRVIntrospector --- include/nbl/asset/utils/CSPIRVIntrospector.h | 75 +++++++++++--------- src/nbl/asset/utils/CSPIRVIntrospector.cpp | 3 + 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index 926343a1f1..77ad1a4799 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -3,7 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h #ifndef _NBL_ASSET_C_SPIRV_INTROSPECTOR_H_INCLUDED_ #define _NBL_ASSET_C_SPIRV_INTROSPECTOR_H_INCLUDED_ - +#if 0 #include "nbl/core/declarations.h" #include 
@@ -27,6 +27,20 @@ namespace spirv_cross struct SPIRType; } +// podzielic CIntrospectionData na dwie klasy +// jedna bez inputOutput i bez push constant blocka `CIntrospectionData` +// druga dziedziczy z pierwszej i dodaje te 2 rzeczy `CStageIntrospectionData` + +// wszystkie struktury w CIntrospecionData powininny u¿ywaæ bit flagi, ozaczaj¹cej shader stage (core::unordered_map) +// CStageIntrospecionData nie powinien u¿ywaæ bit flagi, ozaczaj¹cej shader stage (core::vector) + +// hashowane s¹ tylko set i binding +// dla spec constant tylko specConstantID +// validacja kolizji (dla SSpecConstants mo¿e siê jedynie ró¿niæ name) +// ogarn¹æ sytuacje gdy jeden descriptor binding ma wiêcej arrayElementCount ni¿ w SPIR-V +// w `CStageIntrospectionData` powinien byæ trzymana struktura `SIntrospectionParams` + +// namespace nbl::asset { class NBL_API2 CSPIRVIntrospector : public core::Uncopyable @@ -88,6 +102,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable //! Sorted by `location` core::vector inputOutput; + //! Push constants uniform block struct { bool present; core::string name; @@ -123,7 +138,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable return false; if (cpuShader->getContent()->getSize() != rhs.cpuShader->getContent()->getSize()) return false; - return memcmp(cpuShader->getContent()->getPointer(), rhs.cpuShader->getContent()->getPointer(), cpuShader->getContent()->getSize()) == 0;; + return memcmp(cpuShader->getContent()->getPointer(), rhs.cpuShader->getContent()->getPointer(), cpuShader->getContent()->getSize()) == 0; } }; @@ -132,43 +147,26 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable //! params.cpuShader.contentType should be ECT_SPIRV //! 
the compiled SPIRV must be compiled with IShaderCompiler::SCompilerOptions::debugInfoFlags enabling EDIF_SOURCE_BIT implicitly or explicitly, with no `spirvOptimizer` used in order to include names in introspection data + // powinna zwracac CStageIntrospectionData core::smart_refctd_ptr introspect(const SIntrospectionParams& params, bool insertToCache = true); + // + //core::smart_refctd_ptr merge(const std::span& asdf, const ICPUShader::SSPecInfo::spec_constant_map_t& = {}); + // When the methods take a span of shaders, they are computing things for an imaginary pipeline that includes **all** of them + // przeniesc do CIntrospectionData std::pair::E_TYPE> getImageInfoFromIntrospection(uint32_t set, uint32_t binding, const std::span _infos); - - inline core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection(const std::span _infos) - { - core::vector> introspections(_infos.size()); - if (!introspectAllShaders(introspections.data(),_infos)) - return nullptr; - - return createPushConstantRangesFromIntrospection_impl(introspections.data(),_infos); - } - inline core::smart_refctd_ptr createApproximateDescriptorSetLayoutFromIntrospection(uint32_t set, const std::span _infos) - { - core::vector> introspections(_infos.size()); - if (!introspectAllShaders(introspections.data(),_infos)) - return nullptr; - - return createApproximateDescriptorSetLayoutFromIntrospection_impl(set,introspections.data(), _infos); - } - inline core::smart_refctd_ptr createApproximatePipelineLayoutFromIntrospection(const std::span _infos) - { - core::vector> introspections(_infos.size()); - if (!introspectAllShaders(introspections.data(),_infos)) - return nullptr; - - return createApproximatePipelineLayoutFromIntrospection_impl(introspections.data(),_infos); - } // inline core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const ICPUShader::SSpecInfo& info) + //TODO: inline core::smart_refctd_ptr 
createApproximateComputePipelineFromIntrospection(CStageIntrospectionData* asdf) { if (info.shader->getStage()!=IShader::ESS_COMPUTE) return nullptr; core::smart_refctd_ptr introspection = nullptr; + + //TODO: zamiast tego mergujemy `CStageIntrospectionData` w `CIntrospectionData` u¿ywaj¹c `merge` if (!introspectAllShaders(&introspection,{&info,1})) return nullptr; @@ -180,15 +178,24 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable // core::smart_refctd_ptr createApproximateRenderpassIndependentPipelineFromIntrospection(const std::span _infos); + + struct CShaderStages + { + const CStageIntrospectionData* vertex = nullptr; + const CStageIntrospectionData* fragment = nullptr; + const CStageIntrospectionData* control = nullptr; + const CStageIntrospectionData* evaluation = nullptr; + const CStageIntrospectionData* geometry = nullptr; + } + core::smart_refctd_ptr createApproximateGraphicsPipeline(const CShaderStages& shaderStages); private: - using mapId2SpecConst_t = core::unordered_map; + //TODO: przenieœæ jako members do CIntrospectionData + core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection_impl(); + core::smart_refctd_ptr createApproximateDescriptorSetLayoutFromIntrospection_impl(const uint32_t setID); + core::smart_refctd_ptr createApproximatePipelineLayoutFromIntrospection_impl(); - core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection_impl(core::smart_refctd_ptr* const introspections, const std::span shaders); - core::smart_refctd_ptr createApproximateDescriptorSetLayoutFromIntrospection_impl(uint32_t _set, core::smart_refctd_ptr* const introspections, const std::span shaders); - core::smart_refctd_ptr createApproximatePipelineLayoutFromIntrospection_impl(core::smart_refctd_ptr* const introspections, const std::span shaders); - - bool introspectAllShaders(core::smart_refctd_ptr* introspection, const std::span _infos); + core::smart_refctd_ptr introspectShader(const ICPUShader::SSpecInfo _infos); 
core::smart_refctd_ptr doIntrospection(spirv_cross::Compiler& _comp, const std::string& entryPoint, const IShader::E_SHADER_STAGE stage) const; void shaderMemBlockIntrospection(spirv_cross::Compiler& _comp, impl::SShaderMemoryBlock& _res, uint32_t _blockBaseTypeID, uint32_t _varID, const mapId2SpecConst_t& _sortedId2sconst) const; @@ -213,9 +220,11 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable }; using ParamsToDataMap = core::unordered_map, KeyHasher>; + // using ParamsToDataMap = core::unordered_set, KeyHasher, KeyEquals>; ParamsToDataMap m_introspectionCache; }; } // nbl::asset #endif +#endif diff --git a/src/nbl/asset/utils/CSPIRVIntrospector.cpp b/src/nbl/asset/utils/CSPIRVIntrospector.cpp index 4378661ba0..ed4b9e3634 100644 --- a/src/nbl/asset/utils/CSPIRVIntrospector.cpp +++ b/src/nbl/asset/utils/CSPIRVIntrospector.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#if 0 #include "nbl/asset/utils/CSPIRVIntrospector.h" #include "nbl/asset/utils/spvUtils.h" @@ -798,3 +799,5 @@ CSPIRVIntrospector::CIntrospectionData::~CIntrospectionData() } // nbl:asset + +#endif From 2114e50dabf7b89410c2a753df806070a9a31b1e Mon Sep 17 00:00:00 2001 From: Przemek Date: Fri, 5 Jan 2024 17:59:20 +0100 Subject: [PATCH 15/62] small fixes --- include/nbl/video/IPhysicalDevice.h | 10 +++++----- src/nbl/CMakeLists.txt | 10 ++++------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index f29bf4938d..583c8ac9d0 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -559,11 +559,11 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable if (videoDecodeDPB && !other.videoDecodeDPB) return false; if (videoEncodeInput && !other.videoEncodeInput) return false; if 
(videoEncodeDPB && !other.videoEncodeDPB) return false; - if (other.storageImageLoadWithoutFormat && !storageImageLoadWithoutFormat) return false; - if (other.storageImageStoreWithoutFormat && !storageImageStoreWithoutFormat) return false; - if (other.depthCompareSampledImage && !depthCompareSampledImage) return false; - if (other.hostImageTransfer && !hostImageTransfer) return false; - if (other.log2MaxSamples < log2MaxSamples) return false; + if (storageImageLoadWithoutFormat && !other.storageImageLoadWithoutFormat) return false; + if (storageImageStoreWithoutFormat && !other.storageImageStoreWithoutFormat) return false; + if (depthCompareSampledImage && !other.depthCompareSampledImage) return false; + if (hostImageTransfer && !other.hostImageTransfer) return false; + if (log2MaxSamples > other.log2MaxSamples) return false; return true; } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index dc9f4e7bef..8297a09692 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -83,15 +83,13 @@ include(common) #[[ Loaders and writers compile options available to edit by user All revelant _NBL_COMPILE_WITH will be there]] -option(_NBL_COMPILE_WITH_MTL_LOADER_ "Compile with MTL Loader" ON) -option(_NBL_COMPILE_WITH_OBJ_LOADER_ "Compile with OBJ Loader" ON) +option(_NBL_COMPILE_WITH_MTL_LOADER_ "Compile with MTL Loader" OFF) #default off until Material Compiler 2 +option(_NBL_COMPILE_WITH_OBJ_LOADER_ "Compile with OBJ Loader" OFF) #default off until Material Compiler 2 #option(_NBL_COMPILE_WITH_OBJ_WRITER_ "Compile with OBJ Writer" ON) uncomment when writer exists -option(_NBL_COMPILE_WITH_STL_LOADER_ "Compile with STL Loader" ON) +option(_NBL_COMPILE_WITH_STL_LOADER_ "Compile with STL Loader" OFF) #default off until Material Compiler 2 option(_NBL_COMPILE_WITH_STL_WRITER_ "Compile with STL Writer" ON) -option(_NBL_COMPILE_WITH_PLY_LOADER_ "Compile with PLY Loader" ON) +option(_NBL_COMPILE_WITH_PLY_LOADER_ "Compile with PLY Loader" OFF) #default off 
until Material Compiler 2 option(_NBL_COMPILE_WITH_PLY_WRITER_ "Compile with PLY Writer" ON) -option(_NBL_COMPILE_WITH_BAW_LOADER_ "Compile with BAW Loader" OFF) -option(_NBL_COMPILE_WITH_BAW_WRITER_ "Compile with BAW Writer" OFF) option(_NBL_COMPILE_WITH_JPG_LOADER_ "Compile with JPG Loader" ON) option(_NBL_COMPILE_WITH_JPG_WRITER_ "Compile with JPG Writer" ON) option(_NBL_COMPILE_WITH_PNG_LOADER_ "Compile with PNG Loader" ON) From f6320ce3eeebd1684e3212b6f7f3997063968cde Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 6 Jan 2024 21:44:01 +0100 Subject: [PATCH 16/62] remove unused cruft --- include/nbl/asset/IAssetManager.h | 4 +- include/nbl/core/SingleEventHandler.h | 189 ------------------ include/nbl/core/declarations.h | 1 - .../nbl/video/utilities/ICommandPoolCache.h | 10 +- src/nbl/video/utilities/ICommandPoolCache.cpp | 6 +- 5 files changed, 9 insertions(+), 201 deletions(-) delete mode 100644 include/nbl/core/SingleEventHandler.h diff --git a/include/nbl/asset/IAssetManager.h b/include/nbl/asset/IAssetManager.h index f5c49e264b..572acfa2b3 100644 --- a/include/nbl/asset/IAssetManager.h +++ b/include/nbl/asset/IAssetManager.h @@ -45,7 +45,7 @@ std::function makeAssetDisposeFunc(const IAssetManager* con @see IAsset */ -class NBL_API2 IAssetManager : public core::IReferenceCounted, public core::QuitSignalling +class NBL_API2 IAssetManager : public core::IReferenceCounted { // the point of those functions is that lambdas returned by them "inherits" friendship friend std::function makeAssetGreetFunc(const IAssetManager* const _mgr); @@ -150,8 +150,6 @@ class NBL_API2 IAssetManager : public core::IReferenceCounted, public core::Quit protected: virtual ~IAssetManager() { - quitEventHandler.execute(); - for (size_t i = 0u; i < m_assetCache.size(); ++i) if (m_assetCache[i]) delete m_assetCache[i]; diff --git a/include/nbl/core/SingleEventHandler.h b/include/nbl/core/SingleEventHandler.h deleted file mode 100644 index 2bd7440c31..0000000000 --- 
a/include/nbl/core/SingleEventHandler.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_CORE_CORE_SINGLE_EVENT_HANDLER_H__ -#define __NBL_CORE_CORE_SINGLE_EVENT_HANDLER_H__ - - -#include "nbl/core/decl/Types.h" - -namespace nbl::core -{ - -// TODO: actually implement and test -//#define NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY -class SingleEventHandler -{ - public: - using Function = std::function; - - protected: - using FunctionContainerType = core::forward_list; - using FunctionContainerIt = typename FunctionContainerType::iterator; - - bool mExecuteOnDestroy; - uint32_t mFunctionsCount; - FunctionContainerType mFunctions; - FunctionContainerIt mLastFunction; -#ifdef NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY - // returns prev and - inline std::pair findFunction(const Function& function) - { - auto prev = mFunctions.before_begin(); - auto curr = mFunctions.begin(); - while (prev!=mLastFunction) - { - if (*curr==function) - break; - prev = curr++; - } - return {prev,curr}; - } -#endif - public: - SingleEventHandler(bool executeEventsOnDestroy) : mExecuteOnDestroy(executeEventsOnDestroy), mFunctionsCount(0u) - { - mLastFunction = mFunctions.before_begin(); - } - - virtual ~SingleEventHandler() - { - if (mExecuteOnDestroy) - for (auto& func : mFunctions) - func(); - } - - // - inline auto getFunctionCount() const { return mFunctionsCount; } - - // - inline void registerFunction(Function&& function) - { - mLastFunction = mFunctions.emplace_after(mLastFunction,std::forward(function)); - mFunctionsCount++; - } -#ifdef NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY - // no comparison operator for std::function<> so no find - //! 
does not call the operator() - inline void deregisterFunction(const Function& function) - { - auto found = findFunction(function); - if (found.first!=mLastFunction) - { - if (found.second==mLastFunction) - mLastFunction = found.first; - mFunctions.erase_after(found.first); - } - } - - inline void swapFunctions(const Function& oldFunction, Function&& newFunction) - { - auto found = findFunction(oldFunction); - if (found.second!=mFunctions.end()) - found.second->swap(newFunction); - } -#endif - // - inline void execute() - { - for (auto& func : mFunctions) - func(); - mFunctionsCount = 0u; - mFunctions.clear(); - mLastFunction = mFunctions.before_begin(); - } -}; - -// -class QuitSignalling -{ - public: - inline void registerOnQuit(SingleEventHandler::Function&& function) - { - quitEventHandler.registerFunction(std::move(function)); - } -#ifdef NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY - //! does not call the operator() - inline void deregisterOnQuit(const SingleEventHandler::Function& function) - { - quitEventHandler.deregisterFunction(function); - } -#endif - protected: - QuitSignalling() : quitEventHandler(false) {} - virtual ~QuitSignalling() {assert(!quitEventHandler.getFunctionCount());} - - SingleEventHandler quitEventHandler; -}; - -#ifdef NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY -// -template -class FactoryAndStaticSafeST -{ - T data = {}; - QuitSignalling* factory = nullptr; - - protected: - virtual void preemptiveDestruction() - { - data = T(); - factory = nullptr; - } - - public: - FactoryAndStaticSafeST() = default; - ~FactoryAndStaticSafeST() - { - assert(!factory); - } - - T& getData(QuitSignalling* _factory) - { - if (_factory!=factory) - { - std::function func(preemptiveDestruction); - if (factory) - factory->deregisterOnQuit(func); - _factory->registerOnQuit(std::move(func)); - factory = _factory; - } - return data; - } -}; - -// -template -class FactoryAndStaticSafeMT : protected FactoryAndStaticSafeST -{ - 
static_assert(std::is_standard_layout::value, "Lock class is not standard layout"); - Lockable lock; - - protected: - inline void preemptiveDestruction() override - { - lock.lock(); - FactoryAndStaticSafeST::preemptiveDestruction(); - lock.unlock(); - } - - public: - FactoryAndStaticSafeMT() = default; - ~FactoryAndStaticSafeMT() {} - - std::pair> getData(QuitSignalling* _factory) - { - std::unique_lock lockFirst(lock); - return {FactoryAndStaticSafeST::getData(),std::move(lockFirst)}; - } -}; -#endif - -} - -#endif - - - - diff --git a/include/nbl/core/declarations.h b/include/nbl/core/declarations.h index ea5f4167c2..fa9ebe2b18 100644 --- a/include/nbl/core/declarations.h +++ b/include/nbl/core/declarations.h @@ -66,7 +66,6 @@ #include "nbl/core/util/to_underlying.h" // other useful things -#include "nbl/core/SingleEventHandler.h" #include "nbl/core/EventDeferredHandler.h" #include "nbl/core/IBuffer.h" #include "nbl/core/IReferenceCounted.h" diff --git a/include/nbl/video/utilities/ICommandPoolCache.h b/include/nbl/video/utilities/ICommandPoolCache.h index 2d2e8b8df2..f86ebde930 100644 --- a/include/nbl/video/utilities/ICommandPoolCache.h +++ b/include/nbl/video/utilities/ICommandPoolCache.h @@ -13,13 +13,12 @@ namespace nbl::video { -#if 0 // TODO: port class ICommandPoolCache : public core::IReferenceCounted { public: using CommandPoolAllocator = core::PoolAddressAllocatorST; - NBL_API2 ICommandPoolCache(ILogicalDevice* device, const uint32_t queueFamilyIx, const IGPUCommandPool::CREATE_FLAGS _flags, const uint32_t capacity); + NBL_API2 ICommandPoolCache(ILogicalDevice* const device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity); // inline uint32_t getCapacity() const {return m_cmdPoolAllocator.get_total_size();} @@ -33,6 +32,7 @@ class ICommandPoolCache : public core::IReferenceCounted return nullptr; } +#if 0 // TODO: port // inline uint32_t acquirePool() { @@ -106,12 +106,13 @@ class ICommandPoolCache : public 
core::IReferenceCounted NBL_API2 void operator()(); }; +#endif protected: friend class DeferredCommandPoolResetter; inline virtual ~ICommandPoolCache() { - m_deferredResets.cullEvents(0u); +// m_deferredResets.cullEvents(0u); free(m_reserved); delete[] m_cache; } @@ -121,9 +122,8 @@ class ICommandPoolCache : public core::IReferenceCounted core::smart_refctd_ptr* m_cache; void* m_reserved; CommandPoolAllocator m_cmdPoolAllocator; - GPUDeferredEventHandlerST m_deferredResets; +// GPUDeferredEventHandlerST m_deferredResets; }; -#endif } diff --git a/src/nbl/video/utilities/ICommandPoolCache.cpp b/src/nbl/video/utilities/ICommandPoolCache.cpp index e635911fdb..4c38fb5dec 100644 --- a/src/nbl/video/utilities/ICommandPoolCache.cpp +++ b/src/nbl/video/utilities/ICommandPoolCache.cpp @@ -6,9 +6,8 @@ using namespace nbl; using namespace video; -#if 0 // TODO: port -ICommandPoolCache::ICommandPoolCache(ILogicalDevice* device, const uint32_t queueFamilyIx, const ICommandPool::CREATE_FLAGS _flags, const uint32_t capacity) - : m_reserved(malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u))), m_cmdPoolAllocator(m_reserved,0u,0u,1u,capacity,1u), m_deferredResets() +ICommandPoolCache::ICommandPoolCache(ILogicalDevice* const device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity) + : m_reserved(malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u))), m_cmdPoolAllocator(m_reserved,0u,0u,1u,capacity,1u)//, m_deferredResets() { m_cache = new core::smart_refctd_ptr[capacity]; for (auto i=0u; i Date: Sun, 7 Jan 2024 01:08:41 +0100 Subject: [PATCH 17/62] draft --- include/nbl/video/ISemaphore.h | 229 ++++++++++++++++++++++++++++++++- src/nbl/CMakeLists.txt | 1 + src/nbl/video/IGPUFence.cpp | 18 --- src/nbl/video/ISemaphore.cpp | 21 +++ 4 files changed, 250 insertions(+), 19 deletions(-) delete mode 100644 src/nbl/video/IGPUFence.cpp create mode 100644 src/nbl/video/ISemaphore.cpp diff --git a/include/nbl/video/ISemaphore.h 
b/include/nbl/video/ISemaphore.h index 768fe1a66d..86c112e555 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -70,6 +70,233 @@ class ISemaphore : public IBackendObject SCreationParams m_creationParams; }; -} +class NBL_API2 TimelineEventHandlerBase : core::Unmovable, core::Uncopyable +{ + public: + // little utility + inline ISemaphore* getSemaphore() const {return m_sema.get();} + + protected: + inline TimelineEventHandlerBase(core::smart_refctd_ptr&& sema) : m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()) {} + + template + bool singleSemaphoreWait(const uint64_t value, const std::chrono::time_point& timeout_time) + { + const auto current_time = Clock::now(); + if (timeout_time>current_time && notTimedOut(value,std::chrono::duration_cast(timeout_time-current_time).count()); + return value; // we return it even on device loss or error, as to not hang up blocks for completion + return m_sema->getCounterValue(); + } + + bool notTimedOut(const uint64_t value, const uint64_t nanoseconds); + + core::smart_refctd_ptr m_sema; + uint64_t m_greatestSignal; + uint64_t m_greatestLatch; +}; + +#if 0 +// Could be quite easily made MT and relatively lockless, if only had a good lock-poor circular buffer impl +template +class TimelineEventHandlerST final : public TimelineEventHandlerBase +{ + constexpr static inline bool ReturnsBool = std::is_same_v()()),bool>; + struct FunctorValuePair + { + Functor func; + uint64_t geSemaValue; + }; + // could be a circular buffer but whatever + core::deque m_cb; + + inline uint32_t resetLatch() + { + m_greatestLatch = m_cb.empty() ? 
0:m_cb.back().geSemaValue; + return m_cb.size(); + } + + public: + inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : + TimelineEventHandlerBase(std::move(sema)), m_cb(initialCapacity) + { + resetLatch(); + } + ~TimelineEventHandlerST() + { + while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} + } + + inline uint32_t count() const {return m_cb.size();} + + // You can latch arbitrary functors upon the semaphore reaching some value + inline void latch(const uint64_t geSemaValue, Functor&& function) + { + //const auto oldValue = core::atomic_fetch_max(&m_greatestLatch,geSemaValue); + assert(geSemaValue>=m_greatestLatch); // you cannot latch out of order + m_greatestLatch = geSemaValue; + m_cb.emplace_back(std::move(function),geSemaValue); + } + + // Returns number of events still outstanding + inline uint32_t poll(bool& bailed) + { + m_greatestSignal = m_sema->getCounterValue(); + // in a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal + while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) + { + bailed = false; + if constexpr (ReturnsBool) + bailed = m_cb.front().func(); + m_cb.pop_front(); + if (bailed) + break; + } + return resetLatch(); + } + inline uint32_t poll() + { + bool dummy; + return poll(dummy); + } + + template + inline uint32_t wait(const std::chrono::time_point& timeout_time) + { + if constexpr (ReturnsBool) + { + // Perf-assumption: there are no latched events with wait values less or equal to m_greatestSignal + // So we have a bunch of events with semaphore values between m_greatestSignal and m_greatestLatch +#if 0 + for (std::chrono::time_point currentClockTime; (currentClockTime = Clock::now()) < timeout_time; ) + while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) + { + const bool bail = m_cb.front().func(); + m_cb.pop_front(); + if (bail) + return resetLatch(); + } +#endif + } + 
else + { + m_greatestSignal = singleSemaphoreWait(m_greatestLatch,timeout_time); + while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) + { + m_cb.front().func(); + m_cb.pop_front(); + } + } + return resetLatch(); + } + // The default behaviour of the underlying event handler is to wait for all events in its destructor. + // This will naturally cause you problems if you add functions latched on values you never signal, + // such as when you change your mind whether to submit. This method is then helpful to avoid a deadlock. + inline uint32_t abortOldest(const uint64_t upTo=~0ull) + { + m_greatestSignal = m_sema->getCounterValue(); + while (!m_cb.empty() && m_cb.front().geSemaValue<=upTo) + { + // don't want non-determinitistic behaviour, so execute everything that would have been executed anyway with a while(pollForReady()) + if (m_cb.front().geSemaValue<= m_greatestSignal) + m_cb.front().func(); + m_cb.pop_front(); + } + return resetLatch(); + } + inline uint32_t abortLatest(const uint64_t from=0ull) + { + m_greatestSignal = m_sema->getCounterValue(); + while (!m_cb.empty() && m_cb.back().geSemaValue>=from) + { + // don't want non-determinitistic behaviour, so execute everything that would have been executed anyway with a while(pollForReady()) + if (m_cb.back().geSemaValue<= m_greatestSignal) + m_cb.back().func(); + m_cb.pop_back(); + } + return resetLatch(); + } +}; + +template +class MultiTimelineEventHandlerST final +{ + public: + inline ~MultiTimelineEventHandlerST() + { + for (auto p : m_timelines) + delete p; + } + + inline const auto& getTimelines() const {return m_timelines;} + + // all the members are counteparts of the single timeline version + inline uint32_t count() const + { + uint32_t sum = 0; + for (auto p : m_timelines) + sum += p->count(); + return sum; + } + + inline void latch(ISemaphore* sema, const uint64_t geValue, Functor&& function) + { + auto found = m_timelines.find(sema); + if (found==m_timelines.end()) + found 
m_timelines.insert(found,new TimelineEventHandlerST(core::smart_refctd_ptr(sema))); + assert((*found)->getSemaphore()==sema); + found->latch(sema,geValue,std::move(function)); + } + + inline uint32_t poll() + { + uint32_t sum = 0; + for (auto p : m_timelines) + { + bool bailed; + p->poll(bailed); + if (bailed) + break; + } + return sum; + } + template + inline uint32_t wait(const std::chrono::time_point& timeout_time) + { + // want to give each event equal wait time, so interpolate (albeit weirdly) + return 455; + } + + inline uint32_t abortOldest(const uint64_t upTo=~0ull) + { + uint32_t sum = 0; + for (auto p : m_timelines) + sum += p->abortOldest(upTo); + return sum; + } + inline uint32_t abortLatest(const uint64_t from=0ull) + { + uint32_t sum = 0; + for (auto p : m_timelines) + sum += p->abortLatest(from); + return sum; + } + + private: + struct Compare + { + inline bool operator()(const TimelineEventHandlerST* lhs, const TimelineEventHandlerST* rhs) const + { + return lhs->getSemaphore()getSemaphore(); + } + inline bool operator()(const TimelineEventHandlerST* lhs, const ISemaphore* rhs) const + { + return lhs->getSemaphore() m_timelines; +}; +#endif + +} #endif \ No newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 8297a09692..7c7719052e 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -275,6 +275,7 @@ set(NBL_VIDEO_SOURCES ${NBL_ROOT_PATH}/src/nbl/video/ILogicalDevice.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUAccelerationStructure.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUCommandBuffer.cpp + ${NBL_ROOT_PATH}/src/nbl/video/ISemaphore.cpp ${NBL_ROOT_PATH}/src/nbl/video/IQueue.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUDescriptorSet.cpp ${NBL_ROOT_PATH}/src/nbl/video/IDeviceMemoryAllocation.cpp diff --git a/src/nbl/video/IGPUFence.cpp b/src/nbl/video/IGPUFence.cpp deleted file mode 100644 index 8104ed3313..0000000000 --- a/src/nbl/video/IGPUFence.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include 
"nbl/video/IGPUFence.h" -#include "nbl/video/ILogicalDevice.h" -#include "nbl/video/IPhysicalDevice.h" - -namespace nbl::video -{ - -IGPUFence::E_STATUS GPUEventWrapper::waitFenceWrapper(IGPUFence* fence, uint64_t timeout) -{ - return mDevice->waitForFences(1u,&fence,true,timeout); -} - -IGPUFence::E_STATUS GPUEventWrapper::getFenceStatusWrapper(IGPUFence* fence) -{ - return mDevice->getFenceStatus(fence); -} - -} \ No newline at end of file diff --git a/src/nbl/video/ISemaphore.cpp b/src/nbl/video/ISemaphore.cpp new file mode 100644 index 0000000000..18eca04e5a --- /dev/null +++ b/src/nbl/video/ISemaphore.cpp @@ -0,0 +1,21 @@ +#include "nbl/video/ISemaphore.h" +#include "nbl/video/ILogicalDevice.h" + +namespace nbl::video +{ + +bool TimelineEventHandlerBase::notTimedOut(const uint64_t value, const uint64_t nanoseconds) +{ + const ILogicalDevice::SSemaphoreWaitInfo info = {.semaphore=m_sema.get(),.value=value}; + switch (const_cast(m_sema->getOriginDevice())->waitForSemaphores({&info,1},true,nanoseconds)) + { + case ILogicalDevice::WAIT_RESULT::TIMEOUT: + return false; + break; + default: + break; + } + return true; +} + +} \ No newline at end of file From ad1e6ffdde69a2149836a9f40cbb0417890f6d31 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 16:45:10 +0100 Subject: [PATCH 18/62] move the TimelineEventHandlers to their own header, simplifying everything, also move the Semaphore wait structures and codes to ISemaphore --- include/nbl/video/ILogicalDevice.h | 26 +- include/nbl/video/ISemaphore.h | 246 ++-------------- include/nbl/video/TimelineEventHandlers.h | 325 ++++++++++++++++++++++ src/nbl/CMakeLists.txt | 1 - src/nbl/video/CVulkanLogicalDevice.cpp | 14 +- src/nbl/video/CVulkanLogicalDevice.h | 2 +- src/nbl/video/ISemaphore.cpp | 21 -- 7 files changed, 359 insertions(+), 276 deletions(-) create mode 100644 include/nbl/video/TimelineEventHandlers.h delete mode 100644 src/nbl/video/ISemaphore.cpp diff --git a/include/nbl/video/ILogicalDevice.h 
b/include/nbl/video/ILogicalDevice.h index d4cdc6fd99..a56c311f4e 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -147,32 +147,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual IQueue::RESULT waitIdle() const = 0; //! Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&& ) = 0; - // - struct SSemaphoreWaitInfo - { - const ISemaphore* semaphore; - uint64_t value; - }; - enum class WAIT_RESULT : uint8_t - { - TIMEOUT, - SUCCESS, - DEVICE_LOST, - _ERROR - }; - virtual WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) = 0; + virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) = 0; + virtual ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) = 0; // Forever waiting variant if you're confident that the fence will eventually be signalled - inline WAIT_RESULT blockForSemaphores(const std::span infos, const bool waitAll=true) + inline ISemaphore::WAIT_RESULT blockForSemaphores(const std::span infos, const bool waitAll=true) { + using retval_t = ISemaphore::WAIT_RESULT; if (!infos.empty()) { - auto waitStatus = WAIT_RESULT::TIMEOUT; - while (waitStatus==WAIT_RESULT::TIMEOUT) + auto waitStatus = retval_t::TIMEOUT; + while (waitStatus== retval_t::TIMEOUT) waitStatus = waitForSemaphores(infos,waitAll,999999999ull); return waitStatus; } - return WAIT_RESULT::SUCCESS; + return retval_t::SUCCESS; } //! 
Event Stuff diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 86c112e555..f16fa86baf 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -4,6 +4,8 @@ #include "nbl/core/IReferenceCounted.h" +#include + #include "nbl/video/decl/IBackendObject.h" @@ -13,6 +15,7 @@ namespace nbl::video class ISemaphore : public IBackendObject { public: + // basically a pool function virtual uint64_t getCounterValue() const = 0; //! Basically the counter can only monotonically increase with time (ergo the "timeline"): @@ -23,6 +26,21 @@ class ISemaphore : public IBackendObject // without any execution dependencies, you can only signal a value higher than 2 but less than 3 which is impossible. virtual void signal(const uint64_t value) = 0; + // We don't provide waits as part of the semaphore (cause you can await multiple at once with ILogicalDevice), + // but don't want to pollute ILogicalDevice with lots of enums and structs + struct SWaitInfo + { + const ISemaphore* semaphore; + uint64_t value; + }; + enum class WAIT_RESULT : uint8_t + { + TIMEOUT, + SUCCESS, + DEVICE_LOST, + _ERROR + }; + // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; @@ -70,233 +88,5 @@ class ISemaphore : public IBackendObject SCreationParams m_creationParams; }; -class NBL_API2 TimelineEventHandlerBase : core::Unmovable, core::Uncopyable -{ - public: - // little utility - inline ISemaphore* getSemaphore() const {return m_sema.get();} - - protected: - inline TimelineEventHandlerBase(core::smart_refctd_ptr&& sema) : m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()) {} - - template - bool singleSemaphoreWait(const uint64_t value, const std::chrono::time_point& timeout_time) - { - const auto current_time = Clock::now(); - if (timeout_time>current_time && notTimedOut(value,std::chrono::duration_cast(timeout_time-current_time).count()); - return value; // we return it even on device loss or error, as to not 
hang up blocks for completion - return m_sema->getCounterValue(); - } - - bool notTimedOut(const uint64_t value, const uint64_t nanoseconds); - - core::smart_refctd_ptr m_sema; - uint64_t m_greatestSignal; - uint64_t m_greatestLatch; -}; - -#if 0 -// Could be quite easily made MT and relatively lockless, if only had a good lock-poor circular buffer impl -template -class TimelineEventHandlerST final : public TimelineEventHandlerBase -{ - constexpr static inline bool ReturnsBool = std::is_same_v()()),bool>; - struct FunctorValuePair - { - Functor func; - uint64_t geSemaValue; - }; - // could be a circular buffer but whatever - core::deque m_cb; - - inline uint32_t resetLatch() - { - m_greatestLatch = m_cb.empty() ? 0:m_cb.back().geSemaValue; - return m_cb.size(); - } - - public: - inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : - TimelineEventHandlerBase(std::move(sema)), m_cb(initialCapacity) - { - resetLatch(); - } - ~TimelineEventHandlerST() - { - while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} - } - - inline uint32_t count() const {return m_cb.size();} - - // You can latch arbitrary functors upon the semaphore reaching some value - inline void latch(const uint64_t geSemaValue, Functor&& function) - { - //const auto oldValue = core::atomic_fetch_max(&m_greatestLatch,geSemaValue); - assert(geSemaValue>=m_greatestLatch); // you cannot latch out of order - m_greatestLatch = geSemaValue; - m_cb.emplace_back(std::move(function),geSemaValue); - } - - // Returns number of events still outstanding - inline uint32_t poll(bool& bailed) - { - m_greatestSignal = m_sema->getCounterValue(); - // in a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal - while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) - { - bailed = false; - if constexpr (ReturnsBool) - bailed = m_cb.front().func(); - m_cb.pop_front(); - if (bailed) - 
break; - } - return resetLatch(); - } - inline uint32_t poll() - { - bool dummy; - return poll(dummy); - } - - template - inline uint32_t wait(const std::chrono::time_point& timeout_time) - { - if constexpr (ReturnsBool) - { - // Perf-assumption: there are no latched events with wait values less or equal to m_greatestSignal - // So we have a bunch of events with semaphore values between m_greatestSignal and m_greatestLatch -#if 0 - for (std::chrono::time_point currentClockTime; (currentClockTime = Clock::now()) < timeout_time; ) - while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) - { - const bool bail = m_cb.front().func(); - m_cb.pop_front(); - if (bail) - return resetLatch(); - } -#endif - } - else - { - m_greatestSignal = singleSemaphoreWait(m_greatestLatch,timeout_time); - while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) - { - m_cb.front().func(); - m_cb.pop_front(); - } - } - return resetLatch(); - } - - // The default behaviour of the underlying event handler is to wait for all events in its destructor. - // This will naturally cause you problems if you add functions latched on values you never signal, - // such as when you change your mind whether to submit. This method is then helpful to avoid a deadlock. 
- inline uint32_t abortOldest(const uint64_t upTo=~0ull) - { - m_greatestSignal = m_sema->getCounterValue(); - while (!m_cb.empty() && m_cb.front().geSemaValue<=upTo) - { - // don't want non-determinitistic behaviour, so execute everything that would have been executed anyway with a while(pollForReady()) - if (m_cb.front().geSemaValue<= m_greatestSignal) - m_cb.front().func(); - m_cb.pop_front(); - } - return resetLatch(); - } - inline uint32_t abortLatest(const uint64_t from=0ull) - { - m_greatestSignal = m_sema->getCounterValue(); - while (!m_cb.empty() && m_cb.back().geSemaValue>=from) - { - // don't want non-determinitistic behaviour, so execute everything that would have been executed anyway with a while(pollForReady()) - if (m_cb.back().geSemaValue<= m_greatestSignal) - m_cb.back().func(); - m_cb.pop_back(); - } - return resetLatch(); - } -}; - -template -class MultiTimelineEventHandlerST final -{ - public: - inline ~MultiTimelineEventHandlerST() - { - for (auto p : m_timelines) - delete p; - } - - inline const auto& getTimelines() const {return m_timelines;} - - // all the members are counteparts of the single timeline version - inline uint32_t count() const - { - uint32_t sum = 0; - for (auto p : m_timelines) - sum += p->count(); - return sum; - } - - inline void latch(ISemaphore* sema, const uint64_t geValue, Functor&& function) - { - auto found = m_timelines.find(sema); - if (found==m_timelines.end()) - found m_timelines.insert(found,new TimelineEventHandlerST(core::smart_refctd_ptr(sema))); - assert((*found)->getSemaphore()==sema); - found->latch(sema,geValue,std::move(function)); - } - - inline uint32_t poll() - { - uint32_t sum = 0; - for (auto p : m_timelines) - { - bool bailed; - p->poll(bailed); - if (bailed) - break; - } - return sum; - } - template - inline uint32_t wait(const std::chrono::time_point& timeout_time) - { - // want to give each event equal wait time, so interpolate (albeit weirdly) - return 455; - } - - inline uint32_t 
abortOldest(const uint64_t upTo=~0ull) - { - uint32_t sum = 0; - for (auto p : m_timelines) - sum += p->abortOldest(upTo); - return sum; - } - inline uint32_t abortLatest(const uint64_t from=0ull) - { - uint32_t sum = 0; - for (auto p : m_timelines) - sum += p->abortLatest(from); - return sum; - } - - private: - struct Compare - { - inline bool operator()(const TimelineEventHandlerST* lhs, const TimelineEventHandlerST* rhs) const - { - return lhs->getSemaphore()getSemaphore(); - } - inline bool operator()(const TimelineEventHandlerST* lhs, const ISemaphore* rhs) const - { - return lhs->getSemaphore() m_timelines; -}; -#endif - } #endif \ No newline at end of file diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h new file mode 100644 index 0000000000..938e749622 --- /dev/null +++ b/include/nbl/video/TimelineEventHandlers.h @@ -0,0 +1,325 @@ +#ifndef _NBL_VIDEO_TIMELINE_EVENT_HANDLERS_H_INCLUDED_ +#define _NBL_VIDEO_TIMELINE_EVENT_HANDLERS_H_INCLUDED_ + + +#include "nbl/video/ILogicalDevice.h" + +#include + + +namespace nbl::video +{ + +// Could be made MT and relatively lockless, if only had a good lock-few circular buffer impl +// Not sure its worth the effort as anything using this will probably need to be lockful to be MT +template +class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable +{ + struct FunctorValuePair + { + Functor func; + uint64_t geSemaValue; + }; + // could be a circular buffer but whatever for now + core::deque m_cb; + core::smart_refctd_ptr m_sema; + uint64_t m_greatestSignal; + uint64_t m_greatestLatch; + + template + inline uint32_t for_each_popping(Lambda&& l) + { + if (m_cb.empty()) + return 0; + + if (QueryCounter) + m_greatestSignal = m_sema->getCounterValue(); + // In a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal + // the way that it would happen is we'd `reserveLock` everything in the buffer so far + // then rewind the reservation 
for anything that doesn't meet the predicate. + // For this to work, the predicate needs to be "consistent" meaning no holes can be formed by multiple actors. + while (!m_cb.empty() && l(m_cb.front())) + m_cb.pop_front(); + m_greatestLatch = m_cb.empty() ? 0:m_cb.back().geSemaValue; + return static_cast(m_cb.size()); + } + + inline auto constructNonBailing() + { + return [&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>m_greatestSignal) + return false; + p.func(); + return true; + }; + } + inline auto constructBailing(bool& bailed) + { + return [&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>m_greatestSignal) + return false; + const bool last_bailed = bailed; + bailed = p.func(); + return !last_bailed; + }; + } + + // If the functor returns bool, then we bail on the on the first executed event during wait,poll,etc. + constexpr static inline bool ReturnsBool = std::is_same_v()()),bool>; + + public: + // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. 
+ inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity = 4095 / sizeof(FunctorValuePair) + 1) : + m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()), m_greatestLatch(0) {} + // If you don't want to deadlock here, look into the `abort*` family of methods + ~TimelineEventHandlerST() + { + while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} + } + // little utility + inline ISemaphore* getSemaphore() const {return m_sema.get();} + + inline uint32_t count() const {return m_cb.size();} + + // You can latch arbitrary functors upon the semaphore reaching some value + inline void latch(const uint64_t geSemaValue, Functor&& function) + { + //const auto oldValue = core::atomic_fetch_max(&m_greatestLatch,geSemaValue); + assert(geSemaValue>=m_greatestLatch); // you cannot latch out of order + m_greatestLatch = geSemaValue; + m_cb.emplace_back(std::move(function),geSemaValue); + } + + // Returns number of events still outstanding + inline uint32_t poll(bool& bailed) + { + bailed = false; + if constexpr (ReturnsBool) + return for_each_popping(constructBailing(bailed)); + else + return for_each_popping(constructNonBailing()); + } + inline uint32_t poll() + { + bool dummy; + return poll(dummy); + } + + template + inline uint32_t wait(const std::chrono::time_point& timeout_time) + { + if (m_cb.empty()) + return 0; + + auto singleSemaphoreWait = [&](const uint64_t waitVal, const std::chrono::time_point& waitPoint)->uint64_t + { + const auto current_time = Clock::now(); + if (waitPoint>current_time) + { + auto device = const_cast(m_sema->getOriginDevice()); + const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); + const ISemaphore::SWaitInfo info = {.semaphore=m_sema.get(),.value = waitVal}; + if (device->waitForSemaphores({&info,1},true,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) + return waitVal>m_greatestSignal ? 
waitVal:m_greatestSignal; // remeber that latch can move back, not signal though + } + return m_sema->getCounterValue(); + }; + + if constexpr (ReturnsBool) + { + // Perf-assumption: there are probably no latched events with wait values less or equal to `m_greatestSignal` + // So we have a bunch of events with semaphore values between `m_greatestSignal` and `m_greatestLatch` with + // lots of repeated latch values incrementing by a fixed K amount between each batch of repeats + auto currentTime = Clock::now(); + do + { + // We cannot wait for the original timeout point because we want to be able to bail, so increment slowly + const auto uniqueValueEstimate = core::min(m_cb.size(),m_greatestSignal-m_greatestLatch); + // weird interpolation that works on integers, basically trying to get somethign 1/uniqueValueEstimate of the way from now to original timeout point + const std::chrono::time_point singleWaitTimePt((currentTime.time_since_epoch()*(uniqueValueEstimate-1u)+timeout_time.time_since_epoch())/uniqueValueEstimate); + // So we only Semaphore wait for the next latch value we need + m_greatestSignal = singleSemaphoreWait(m_cb.front().geSemaValue,singleWaitTimePt); + + bool bailed = false; + for_each_popping(constructBailing(bailed)); + if (bailed) + break; + } while ((currentTime=Clock::now())(constructNonBailing()); + } + } + + // The default behaviour of the underlying event handler is to wait for all events in its destructor. + // This will naturally cause you problems if you add functions latched on values you never signal, + // such as when you change your mind whether to submit. This method is then helpful to avoid a deadlock. 
+ inline uint32_t abortOldest(const uint64_t upTo) + { + return for_each_popping([&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>upTo) + return false; + // don't want weird behaviour, so execute everything that would have been executed + // if a single `poll()` was called before `abortOldest` + if (p.geSemaValue<=m_greatestSignal) + p.func(); + return true; + } + ); + } + inline uint32_t abortLatest(const uint64_t from) + { + // We also need to run the functors in the same order they'd be ran with a single `poll()`, + // so we run all of them from the front, not just from the `from` value. + for_each_popping(constructNonBailing()); + // now kill the latest stuff + while (!m_cb.empty() && m_cb.back().geSemaValue>=from) + m_cb.pop_back(); + return m_cb.size(); + } + inline void abortAll() {abortOldest(~0ull);} +}; + +// +template +class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable +{ + public: + using TimelineEventHandler = TimelineEventHandlerST; + inline ~MultiTimelineEventHandlerST() + { + clear(); + } + + inline const auto& getTimelines() const {return m_timelines;} + + // all the members are counteparts of the single timeline version + inline uint32_t count() const + { + uint32_t sum = 0; + for (auto p : m_timelines) + sum += p->count(); + return sum; + } + + inline void latch(ISemaphore* sema, const uint64_t geValue, Functor&& function) + { + auto found = m_timelines.find(sema); + if (found==m_timelines.end()) + { + STimeline newTimeline = { + .handler = new TimelineEventHandler(core::smart_refctd_ptr(sema)), + .waitInfoIx = m_scratchWaitInfos.size() + }; + found = m_timelines.insert(found,std::move(newTimeline)); + m_scratchWaitInfos.emplace_back(sema,0xdeadbeefBADC0FFEull); + } + assert(found->handler->getSemaphore()==sema); + found->handler->latch(sema,geValue,std::move(function)); + } + + inline uint32_t poll() + { + uint32_t sum = 0; + for (auto p : m_timelines) + { + bool bailed; + p->poll(bailed); + if (bailed) + break; 
+ } + return sum; + } + +#if 0 + template + inline uint32_t wait(const std::chrono::time_point& timeout_time) + { + return 455; + } +#endif + + inline void abortAll() + { + for (auto& p : m_timelines) + p.handler->abortAll(); + clear(); + } + inline uint32_t abortOldest(const uint64_t upTo=~0ull) + { + uint32_t sum = 0; + for (auto& p : m_timelines) + sum += p.handler->abortOldest(upTo); + return sum; + } + inline uint32_t abortLatest(const uint64_t from=0ull) + { + uint32_t sum = 0; + for (auto& p : m_timelines) + sum += p.handler->abortLatest(from); + return sum; + } + + private: + struct STimeline + { + inline auto operator<=>(const STimeline& rhs) const + { + return handler->getSemaphore()-rhs.handler->getSemaphore(); + } + inline auto operator<=>(const ISemaphore* rhs) const + { + return handler->getSemaphore()-rhs; + } + + TimelineEventHandler* handler; + size_t waitInfoIx; + }; + // We use a `set<>` instead of `unordered_set<>` because we assume you won't spam semaphores/timelines + using container_t = core::set; + + template + inline uint32_t for_each_erasing(Lambda&& l) + { + uint32_t sum = 0; + // we don't check erasing when l(*it)==false on purpose, it only happens in poll and the timeline semaphore is likely to get re-added + for (auto it=m_timelines.begin(); it!=m_timelines.end() && l(*it); ) + it = it->handler->count() ? 
(it++):eraseTimeline(it); + return sum; + } + + inline container_t::iterator eraseTimeline(container_t::iterator timeline) + { + // if not the last in scratch + if (timeline->waitInfoIxwaitInfoIx; + m_scratchWaitInfos[timeline->waitInfoIx] = lastScratch; + } + m_scratchWaitInfos.pop_back(); + delete timeline->handler; + return m_timelines.erase(timeline); + } + + inline void clear() + { + m_scratchWaitInfos.clear(); + for (auto p : m_timelines) + delete p.handler; + m_timelines.clear(); + } + + container_t m_timelines; + core::vector m_scratchWaitInfos; +}; + +} +#endif \ No newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 7c7719052e..8297a09692 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -275,7 +275,6 @@ set(NBL_VIDEO_SOURCES ${NBL_ROOT_PATH}/src/nbl/video/ILogicalDevice.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUAccelerationStructure.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUCommandBuffer.cpp - ${NBL_ROOT_PATH}/src/nbl/video/ISemaphore.cpp ${NBL_ROOT_PATH}/src/nbl/video/IQueue.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUDescriptorSet.cpp ${NBL_ROOT_PATH}/src/nbl/video/IDeviceMemoryAllocation.cpp diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 002dad3ae7..0714481ac8 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -79,8 +79,10 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaph return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), semaphore, std::move(params)); } -auto CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) -> WAIT_RESULT +ISemaphore::WAIT_RESULT CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) { + using retval_t = ISemaphore::WAIT_RESULT; + core::vector semaphores(infos.size()); core::vector values(infos.size()); auto outSemaphores = semaphores.data(); @@ 
-89,7 +91,7 @@ auto CVulkanLogicalDevice::waitForSemaphores(const std::span(info.semaphore,this); if (!sema) - WAIT_RESULT::_ERROR; + retval_t::_ERROR; *(outSemaphores++) = sema->getInternalObject(); *(outValues++) = info.value; } @@ -102,15 +104,15 @@ auto CVulkanLogicalDevice::waitForSemaphores(const std::span CVulkanLogicalDevice::createEvent(const IEvent::CREATE_FLAGS flags) diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index d8f934ceb9..0df38ffd67 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -53,7 +53,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice } core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) override; - WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; + ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; diff --git a/src/nbl/video/ISemaphore.cpp b/src/nbl/video/ISemaphore.cpp deleted file mode 100644 index 18eca04e5a..0000000000 --- a/src/nbl/video/ISemaphore.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "nbl/video/ISemaphore.h" -#include "nbl/video/ILogicalDevice.h" - -namespace nbl::video -{ - -bool TimelineEventHandlerBase::notTimedOut(const uint64_t value, const uint64_t nanoseconds) -{ - const ILogicalDevice::SSemaphoreWaitInfo info = {.semaphore=m_sema.get(),.value=value}; - switch (const_cast(m_sema->getOriginDevice())->waitForSemaphores({&info,1},true,nanoseconds)) - { - case ILogicalDevice::WAIT_RESULT::TIMEOUT: - return false; - break; - default: - break; - } - return true; -} - -} \ No newline at end of file From a1afcc88ce310b354941fa923aa90b5c67d66a55 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 19:17:14 +0100 Subject: [PATCH 19/62] Made the TimelineEventHandlerST use a const ISemaphore, almost all of 
MultiTimelineEventHandlerST is implemented --- include/nbl/video/TimelineEventHandlers.h | 215 ++++++++++++++-------- 1 file changed, 134 insertions(+), 81 deletions(-) diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 938e749622..865d3b6469 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -9,12 +9,16 @@ namespace nbl::video { +template +class MultiTimelineEventHandlerST; // Could be made MT and relatively lockless, if only had a good lock-few circular buffer impl // Not sure its worth the effort as anything using this will probably need to be lockful to be MT template class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable { + friend MultiTimelineEventHandlerST; + struct FunctorValuePair { Functor func; @@ -22,10 +26,10 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable }; // could be a circular buffer but whatever for now core::deque m_cb; - core::smart_refctd_ptr m_sema; + core::smart_refctd_ptr m_sema; uint64_t m_greatestSignal; uint64_t m_greatestLatch; - + template inline uint32_t for_each_popping(Lambda&& l) { @@ -44,34 +48,33 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable return static_cast(m_cb.size()); } - inline auto constructNonBailing() + template + inline auto constructNonBailing(Args&&... args) { return [&](FunctorValuePair& p) -> bool { if (p.geSemaValue>m_greatestSignal) return false; - p.func(); + p.func(std::forward(args)...); return true; }; } - inline auto constructBailing(bool& bailed) + template + inline auto constructBailing(bool& bailed, Args&&... args) { return [&](FunctorValuePair& p) -> bool { if (p.geSemaValue>m_greatestSignal) return false; const bool last_bailed = bailed; - bailed = p.func(); + bailed = p.func(std::forward(args)...); return !last_bailed; }; } - // If the functor returns bool, then we bail on the on the first executed event during wait,poll,etc. 
- constexpr static inline bool ReturnsBool = std::is_same_v()()),bool>; - public: // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. - inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity = 4095 / sizeof(FunctorValuePair) + 1) : + inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity = 4095 / sizeof(FunctorValuePair) + 1) : m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()), m_greatestLatch(0) {} // If you don't want to deadlock here, look into the `abort*` family of methods ~TimelineEventHandlerST() @@ -79,7 +82,7 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} } // little utility - inline ISemaphore* getSemaphore() const {return m_sema.get();} + inline const ISemaphore* getSemaphore() const {return m_sema.get();} inline uint32_t count() const {return m_cb.size();} @@ -92,29 +95,36 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable m_cb.emplace_back(std::move(function),geSemaValue); } - // Returns number of events still outstanding - inline uint32_t poll(bool& bailed) + // + struct PollResult { - bailed = false; + uint32_t eventsLeft = ~0u; + bool bailed = false; + }; + template + inline PollResult poll(Args&&... 
args) + { + PollResult retval = {}; + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; if constexpr (ReturnsBool) - return for_each_popping(constructBailing(bailed)); + retval.eventsLeft = for_each_popping(constructBailing(retval.bailed,std::forward(args)...)); else - return for_each_popping(constructNonBailing()); - } - inline uint32_t poll() - { - bool dummy; - return poll(dummy); + retval.eventsLeft = for_each_popping(constructNonBailing(std::forward(args)...)); + return retval; } - template - inline uint32_t wait(const std::chrono::time_point& timeout_time) + template + inline uint32_t wait(const std::chrono::time_point& timeout_time, Args&&... args) { if (m_cb.empty()) return 0; - auto singleSemaphoreWait = [&](const uint64_t waitVal, const std::chrono::time_point& waitPoint)->uint64_t + auto singleSemaphoreWait = [&](const uint64_t waitVal, const std::chrono::time_point& waitPoint) -> void { + // remeber that latch can move back, not signal though + if (waitVal<=m_greatestSignal) + return; + const auto current_time = Clock::now(); if (waitPoint>current_time) { @@ -122,11 +132,12 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); const ISemaphore::SWaitInfo info = {.semaphore=m_sema.get(),.value = waitVal}; if (device->waitForSemaphores({&info,1},true,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) - return waitVal>m_greatestSignal ? 
waitVal:m_greatestSignal; // remeber that latch can move back, not signal though + m_greatestSignal = waitVal; } - return m_sema->getCounterValue(); + m_greatestSignal = m_sema->getCounterValue(); }; - + + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; if constexpr (ReturnsBool) { // Perf-assumption: there are probably no latched events with wait values less or equal to `m_greatestSignal` @@ -140,10 +151,10 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable // weird interpolation that works on integers, basically trying to get somethign 1/uniqueValueEstimate of the way from now to original timeout point const std::chrono::time_point singleWaitTimePt((currentTime.time_since_epoch()*(uniqueValueEstimate-1u)+timeout_time.time_since_epoch())/uniqueValueEstimate); // So we only Semaphore wait for the next latch value we need - m_greatestSignal = singleSemaphoreWait(m_cb.front().geSemaValue,singleWaitTimePt); + singleSemaphoreWait(m_cb.front().geSemaValue,singleWaitTimePt); bool bailed = false; - for_each_popping(constructBailing(bailed)); + for_each_popping(constructBailing(bailed,std::forward(args)...)); if (bailed) break; } while ((currentTime=Clock::now())(constructNonBailing()); + singleSemaphoreWait(m_greatestLatch,timeout_time); + return for_each_popping(constructNonBailing(std::forward(args)...)); } } // The default behaviour of the underlying event handler is to wait for all events in its destructor. // This will naturally cause you problems if you add functions latched on values you never signal, // such as when you change your mind whether to submit. This method is then helpful to avoid a deadlock. - inline uint32_t abortOldest(const uint64_t upTo) + template + inline uint32_t abortOldest(const uint64_t upTo, Args&&... 
args) { return for_each_popping([&](FunctorValuePair& p) -> bool { @@ -168,22 +180,24 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable // don't want weird behaviour, so execute everything that would have been executed // if a single `poll()` was called before `abortOldest` if (p.geSemaValue<=m_greatestSignal) - p.func(); + p.func(std::forward(args)...); return true; } ); } - inline uint32_t abortLatest(const uint64_t from) + template + inline uint32_t abortLatest(const uint64_t from, Args&&... args) { // We also need to run the functors in the same order they'd be ran with a single `poll()`, // so we run all of them from the front, not just from the `from` value. - for_each_popping(constructNonBailing()); + for_each_popping(constructNonBailing(std::forward(args)...)); // now kill the latest stuff while (!m_cb.empty() && m_cb.back().geSemaValue>=from) m_cb.pop_back(); return m_cb.size(); } - inline void abortAll() {abortOldest(~0ull);} + template + inline void abortAll(Args&&... 
args) {abortOldest(~0ull,std::forward(args)...);} }; // @@ -192,6 +206,8 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable { public: using TimelineEventHandler = TimelineEventHandlerST; + + inline MultiTimelineEventHandlerST(core::smart_refctd_ptr&& device) : m_device(std::move(device)) {} inline ~MultiTimelineEventHandlerST() { clear(); @@ -204,46 +220,104 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable { uint32_t sum = 0; for (auto p : m_timelines) - sum += p->count(); + sum += p.handler->count(); return sum; } - inline void latch(ISemaphore* sema, const uint64_t geValue, Functor&& function) + inline bool latch(const ISemaphore::SWaitInfo& futureWait, Functor&& function) { - auto found = m_timelines.find(sema); + auto found = m_timelines.find(futureWait.semaphore); if (found==m_timelines.end()) { + if (futureWait.semaphore->getOriginDevice()!=m_device.get()) + return false; STimeline newTimeline = { - .handler = new TimelineEventHandler(core::smart_refctd_ptr(sema)), + .handler = new TimelineEventHandler(core::smart_refctd_ptr(futureWait.semaphore)), .waitInfoIx = m_scratchWaitInfos.size() }; found = m_timelines.insert(found,std::move(newTimeline)); - m_scratchWaitInfos.emplace_back(sema,0xdeadbeefBADC0FFEull); + m_scratchWaitInfos.emplace_back(futureWait.semaphore,0xdeadbeefBADC0FFEull); } - assert(found->handler->getSemaphore()==sema); - found->handler->latch(sema,geValue,std::move(function)); + assert(found->handler->getSemaphore()==futureWait.semaphore); + found->handler->latch(futureWait.value,std::move(function)); + return true; } - inline uint32_t poll() - { - uint32_t sum = 0; - for (auto p : m_timelines) + template + inline typename TimelineEventHandler::PollResult poll(Args&&... 
args) + { + typename TimelineEventHandler::PollResult retval = {0,false}; + for (typename container_t::iterator it=m_timelines.begin(); it!=m_timelines.end(); ) { - bool bailed; - p->poll(bailed); - if (bailed) - break; + if (!retval.bailed) + { + const auto local = it->handler->poll(); + retval.eventsLeft += local.eventsLeft; + retval.bailed = local.bailed; + } + if (it->handler->count()) + it++; + else + it = eraseTimeline(it); } - return sum; + return retval; } -#if 0 template inline uint32_t wait(const std::chrono::time_point& timeout_time) { + bool allEmpty = true; + for (typename container_t::iterator it=m_timelines.begin(); it!=m_timelines.end(); ) + { + if (it->handler->count()) + { +#if 0 + // TODO: adapt + const waitVal = it->handler->m_greatestLatch; + // need to fill all waits anyway + m_scratchWaitInfos[it->waitInfoIx].value = waitVal; + // remeber that latch can move back, not signal though + if (waitVal>it->handler->m_greatestSignal) + allEmpty = false; +#endif + it++; + } + else + it = eraseTimeline(it); + } + if (allEmpty) + return 0; + + constexpr bool ReturnsBool = false; + auto singleSemaphoreWait = [&](const std::chrono::time_point& waitPoint) -> bool + { + const auto current_time = Clock::now(); + if (waitPoint>current_time) + { + const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); + if (m_device->waitForSemaphores(m_scratchWaitInfos,!ReturnsBool,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) + return true; // remeber that latch can move back, not signal though + } + // + + return false; + }; + + if constexpr (ReturnsBool) + { + return 600; + } + else + { + if (singleSemaphoreWait(timeout_time)) + { + clear(); + return 0; + } + } + return 455; } -#endif inline void abortAll() { @@ -251,20 +325,6 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable p.handler->abortAll(); clear(); } - inline uint32_t abortOldest(const uint64_t upTo=~0ull) - { - uint32_t sum = 0; - for (auto& p : 
m_timelines) - sum += p.handler->abortOldest(upTo); - return sum; - } - inline uint32_t abortLatest(const uint64_t from=0ull) - { - uint32_t sum = 0; - for (auto& p : m_timelines) - sum += p.handler->abortLatest(from); - return sum; - } private: struct STimeline @@ -282,26 +342,18 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable size_t waitInfoIx; }; // We use a `set<>` instead of `unordered_set<>` because we assume you won't spam semaphores/timelines - using container_t = core::set; - - template - inline uint32_t for_each_erasing(Lambda&& l) - { - uint32_t sum = 0; - // we don't check erasing when l(*it)==false on purpose, it only happens in poll and the timeline semaphore is likely to get re-added - for (auto it=m_timelines.begin(); it!=m_timelines.end() && l(*it); ) - it = it->handler->count() ? (it++):eraseTimeline(it); - return sum; - } + // also we need to be able to continue iteration after an erasure of a single element + using container_t = core::set/*quirk of STL*/>; - inline container_t::iterator eraseTimeline(container_t::iterator timeline) + inline container_t::iterator eraseTimeline(typename container_t::iterator timeline) { // if not the last in scratch if (timeline->waitInfoIxwaitInfoIx; + typename container_t::iterator found = m_timelines.find(lastScratch.semaphore); +// found->waitInfoIx = timeline->waitInfoIx; m_scratchWaitInfos[timeline->waitInfoIx] = lastScratch; } m_scratchWaitInfos.pop_back(); @@ -319,6 +371,7 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable container_t m_timelines; core::vector m_scratchWaitInfos; + core::smart_refctd_ptr m_device; }; } From 262281fecc6e05bd93ac73455f91384050d156b1 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 21:31:14 +0100 Subject: [PATCH 20/62] implement MultiTimelineEventHandlerST and correct TimelineEventHandlerST --- include/nbl/video/TimelineEventHandlers.h | 269 +++++++++++++--------- 1 file changed, 159 insertions(+), 110 
deletions(-) diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 865d3b6469..0f4e7015a7 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -17,64 +17,9 @@ class MultiTimelineEventHandlerST; template class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable { - friend MultiTimelineEventHandlerST; - - struct FunctorValuePair - { - Functor func; - uint64_t geSemaValue; - }; - // could be a circular buffer but whatever for now - core::deque m_cb; - core::smart_refctd_ptr m_sema; - uint64_t m_greatestSignal; - uint64_t m_greatestLatch; - - template - inline uint32_t for_each_popping(Lambda&& l) - { - if (m_cb.empty()) - return 0; - - if (QueryCounter) - m_greatestSignal = m_sema->getCounterValue(); - // In a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal - // the way that it would happen is we'd `reserveLock` everything in the buffer so far - // then rewind the reservation for anything that doesn't meet the predicate. - // For this to work, the predicate needs to be "consistent" meaning no holes can be formed by multiple actors. - while (!m_cb.empty() && l(m_cb.front())) - m_cb.pop_front(); - m_greatestLatch = m_cb.empty() ? 0:m_cb.back().geSemaValue; - return static_cast(m_cb.size()); - } - - template - inline auto constructNonBailing(Args&&... args) - { - return [&](FunctorValuePair& p) -> bool - { - if (p.geSemaValue>m_greatestSignal) - return false; - p.func(std::forward(args)...); - return true; - }; - } - template - inline auto constructBailing(bool& bailed, Args&&... 
args) - { - return [&](FunctorValuePair& p) -> bool - { - if (p.geSemaValue>m_greatestSignal) - return false; - const bool last_bailed = bailed; - bailed = p.func(std::forward(args)...); - return !last_bailed; - }; - } - public: // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. - inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity = 4095 / sizeof(FunctorValuePair) + 1) : + inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()), m_greatestLatch(0) {} // If you don't want to deadlock here, look into the `abort*` family of methods ~TimelineEventHandlerST() @@ -104,13 +49,7 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable template inline PollResult poll(Args&&... args) { - PollResult retval = {}; - constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; - if constexpr (ReturnsBool) - retval.eventsLeft = for_each_popping(constructBailing(retval.bailed,std::forward(args)...)); - else - retval.eventsLeft = for_each_popping(constructNonBailing(std::forward(args)...)); - return retval; + return poll_impl(std::forward(args)...); } template @@ -132,7 +71,10 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); const ISemaphore::SWaitInfo info = {.semaphore=m_sema.get(),.value = waitVal}; if (device->waitForSemaphores({&info,1},true,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) + { m_greatestSignal = waitVal; + return; + } } m_greatestSignal = m_sema->getCounterValue(); }; @@ -198,6 +140,74 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable } template inline void abortAll(Args&&... 
args) {abortOldest(~0ull,std::forward(args)...);} + + private: + friend MultiTimelineEventHandlerST; + + struct FunctorValuePair + { + Functor func; + uint64_t geSemaValue; + }; + // could be a circular buffer but whatever for now + core::deque m_cb; + core::smart_refctd_ptr m_sema; + uint64_t m_greatestSignal; + uint64_t m_greatestLatch; + + template + inline uint32_t for_each_popping(Lambda&& l) + { + if (m_cb.empty()) + return 0; + + if (QueryCounter) + m_greatestSignal = m_sema->getCounterValue(); + // In a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal + // the way that it would happen is we'd `reserveLock` everything in the buffer so far + // then rewind the reservation for anything that doesn't meet the predicate. + // For this to work, the predicate needs to be "consistent" meaning no holes can be formed by multiple actors. + while (!m_cb.empty() && l(m_cb.front())) + m_cb.pop_front(); + m_greatestLatch = m_cb.empty() ? 0:m_cb.back().geSemaValue; + return static_cast(m_cb.size()); + } + + template + inline auto constructNonBailing(Args&&... args) + { + return [&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>m_greatestSignal) + return false; + p.func(std::forward(args)...); + return true; + }; + } + template + inline auto constructBailing(bool& bailed, Args&&... args) + { + return [&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>m_greatestSignal) + return false; + const bool last_bailed = bailed; + bailed = p.func(std::forward(args)...); + return !last_bailed; + }; + } + + template + inline PollResult poll_impl(Args&&... 
args) + { + PollResult retval = {}; + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)), bool>; + if constexpr (ReturnsBool) + retval.eventsLeft = for_each_popping(constructBailing(retval.bailed, std::forward(args)...)); + else + retval.eventsLeft = for_each_popping(constructNonBailing(std::forward(args)...)); + return retval; + } }; // @@ -247,11 +257,11 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable inline typename TimelineEventHandler::PollResult poll(Args&&... args) { typename TimelineEventHandler::PollResult retval = {0,false}; - for (typename container_t::iterator it=m_timelines.begin(); it!=m_timelines.end(); ) + for (auto it=m_timelines.begin(); it!=m_timelines.end(); ) { if (!retval.bailed) { - const auto local = it->handler->poll(); + const auto local = it->handler->poll(std::forward(args)...); retval.eventsLeft += local.eventsLeft; retval.bailed = local.bailed; } @@ -263,60 +273,88 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable return retval; } - template - inline uint32_t wait(const std::chrono::time_point& timeout_time) + template + inline uint32_t wait(const std::chrono::time_point& timeout_time, Args&&... 
args) { - bool allEmpty = true; - for (typename container_t::iterator it=m_timelines.begin(); it!=m_timelines.end(); ) - { - if (it->handler->count()) - { -#if 0 - // TODO: adapt - const waitVal = it->handler->m_greatestLatch; - // need to fill all waits anyway - m_scratchWaitInfos[it->waitInfoIx].value = waitVal; - // remeber that latch can move back, not signal though - if (waitVal>it->handler->m_greatestSignal) - allEmpty = false; -#endif - it++; - } - else - it = eraseTimeline(it); - } - if (allEmpty) - return 0; - - constexpr bool ReturnsBool = false; - auto singleSemaphoreWait = [&](const std::chrono::time_point& waitPoint) -> bool + auto nanosecondsLeft = [](const std::chrono::time_point& waitPoint)->uint64_t { const auto current_time = Clock::now(); - if (waitPoint>current_time) - { - const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); - if (m_device->waitForSemaphores(m_scratchWaitInfos,!ReturnsBool,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) - return true; // remeber that latch can move back, not signal though - } - // - - return false; + if (current_time>=waitPoint) + return 0; + return std::chrono::duration_cast(waitPoint-current_time).count(); }; - if constexpr (ReturnsBool) - { - return 600; - } - else + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; + constexpr bool WaitAll = !ReturnsBool; + + uint32_t sum = 0; + do { - if (singleSemaphoreWait(timeout_time)) + auto uniqueValueEstimate = 1; + // `waitsToPerform` isn't very conservative, it doesn't mean there are no latched events + // instead it means that there is no point waiting with the device on the semaphore + // because the value we're about to wait for was already attained. 
+ bool waitsToPerform = false; + // first gather all the wait values if there's time to even perform a wait + if (nanosecondsLeft(timeout_time)) + for (auto it=m_timelines.begin(); it!=m_timelines.end(); ) { - clear(); - return 0; + // will return 0 for an empty event list + const auto waitVal = it->getWaitValue(WaitAll); + if (waitVal) + { + // need to fill all waits anyway even if its redudant + m_scratchWaitInfos[it->waitInfoIx].value = waitVal; + // remeber that latch can move back, not the signal though + if (waitVal>it->handler->m_greatestSignal) + { + uniqueValueEstimate = core::max(core::min(it->handler->m_cb.size(),it->handler->m_greatestSignal-it->handler->m_greatestLatch),uniqueValueEstimate); + waitsToPerform = true; + } + it++; + } + else + it = eraseTimeline(it); } - } - return 455; + bool allReady = false; + if (waitsToPerform) + { + const std::chrono::time_point singleWaitTimePt((Clock::now().time_since_epoch()*(uniqueValueEstimate-1u)+timeout_time.time_since_epoch())/uniqueValueEstimate); + if (const auto nano = nanosecondsLeft(WaitAll ? timeout_time:singleWaitTimePt)) + if (m_device->waitForSemaphores(m_scratchWaitInfos,WaitAll,nano)==ISemaphore::WAIT_RESULT::SUCCESS) + allReady = WaitAll || m_scratchWaitInfos.size()==1; + } + + sum = 0; + bool bailed = false; + for (auto it=m_timelines.begin(); it!=m_timelines.end(); ) + { + auto* handler = it->handler; + // only if we waited for all semaphores, we can just set their greatest signal value to the value we awaited + handler->m_greatestSignal = allReady ? 
it->getWaitValue(WaitAll):handler->getSemaphore()->getCounterValue(); + if (bailed) + sum += handler->count(); + else + { + const auto local = handler->poll_impl(std::forward(args)...); + bailed = local.bailed; + // if don't have any events left, remove the timeline + if (local.eventsLeft) + { + sum += local.eventsLeft; + it++; + } + // but there's a fast path at the end + else if (ReturnsBool || !allReady) + it = eraseTimeline(it); + } + } + // ultra fast path for non-bailing code when everything was covered by a single wait + if (WaitAll && allReady) + clear(); + } while (sum && Clock::now()m_cb.empty()) + return 0ull; + // following same assumptions as the single-timeline case + if (waitAll) + return handler->m_greatestLatch; + else + return handler->m_cb.front().geSemaValue; + } + inline auto operator<=>(const STimeline& rhs) const { return handler->getSemaphore()-rhs.handler->getSemaphore(); From d7690be686496be53571468c3e465424d1848496 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 22:17:37 +0100 Subject: [PATCH 21/62] fix KHR function loading bugs --- src/nbl/video/CVulkanCommandBuffer.cpp | 8 ++++---- src/nbl/video/CVulkanSemaphore.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index af090c92c3..188ca33595 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -153,13 +153,13 @@ bool CVulkanCommandBuffer::setEvent_impl(IEvent* const _event, const SEventDepen return false; auto info = fill(memoryBarriers.data(),bufferBarriers.data(),imageBarriers.data(),depInfo); - getFunctionTable().vkCmdSetEvent2KHR(m_cmdbuf,static_cast(_event)->getInternalObject(),&info); + getFunctionTable().vkCmdSetEvent2(m_cmdbuf,static_cast(_event)->getInternalObject(),&info); return true; } bool CVulkanCommandBuffer::resetEvent_impl(IEvent* const _event, const core::bitflag stageMask) { - 
getFunctionTable().vkCmdResetEvent2KHR(m_cmdbuf,static_cast(_event)->getInternalObject(),getVkPipelineStageFlagsFromPipelineStageFlags(stageMask)); + getFunctionTable().vkCmdResetEvent2(m_cmdbuf,static_cast(_event)->getInternalObject(),getVkPipelineStageFlagsFromPipelineStageFlags(stageMask)); return true; } @@ -196,7 +196,7 @@ bool CVulkanCommandBuffer::waitEvents_impl(const uint32_t eventCount, IEvent* co bufBarrierCount += infos[i].bufferMemoryBarrierCount; imgBarrierCount += infos[i].imageMemoryBarrierCount; } - getFunctionTable().vkCmdWaitEvents2KHR(m_cmdbuf,eventCount,events.data(),infos.data()); + getFunctionTable().vkCmdWaitEvents2(m_cmdbuf,eventCount,events.data(),infos.data()); return true; } @@ -553,7 +553,7 @@ bool CVulkanCommandBuffer::endQuery_impl(IQueryPool* const queryPool, const uint bool CVulkanCommandBuffer::writeTimestamp_impl(const asset::PIPELINE_STAGE_FLAGS pipelineStage, IQueryPool* const queryPool, const uint32_t query) { - getFunctionTable().vkCmdWriteTimestamp2KHR(m_cmdbuf, getVkPipelineStageFlagsFromPipelineStageFlags(pipelineStage), static_cast(queryPool)->getInternalObject(), query); + getFunctionTable().vkCmdWriteTimestamp2(m_cmdbuf, getVkPipelineStageFlagsFromPipelineStageFlags(pipelineStage), static_cast(queryPool)->getInternalObject(), query); return true; } diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index d3dbce8e12..071c4b2843 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -15,7 +15,7 @@ uint64_t CVulkanSemaphore::getCounterValue() const { uint64_t retval = 0u; const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - vulkanDevice->getFunctionTable()->vk.vkGetSemaphoreCounterValueKHR(vulkanDevice->getInternalObject(), m_semaphore, &retval); + vulkanDevice->getFunctionTable()->vk.vkGetSemaphoreCounterValue(vulkanDevice->getInternalObject(), m_semaphore, &retval); return retval; } @@ -26,7 +26,7 @@ void 
CVulkanSemaphore::signal(const uint64_t value) info.value = value; const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - vulkanDevice->getFunctionTable()->vk.vkSignalSemaphoreKHR(vulkanDevice->getInternalObject(), &info); + vulkanDevice->getFunctionTable()->vk.vkSignalSemaphore(vulkanDevice->getInternalObject(), &info); } void CVulkanSemaphore::setObjectDebugName(const char* label) const From 13ff02a6abae79a2e9c21fa1581b744b9774ac51 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 22:41:57 +0100 Subject: [PATCH 22/62] fix some nasty bug in TimelineEventHandlerST --- include/nbl/video/TimelineEventHandlers.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 0f4e7015a7..33832c0605 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -20,7 +20,10 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable public: // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. 
inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : - m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()), m_greatestLatch(0) {} + m_sema(std::move(sema)), m_greatestLatch(0) + { + m_greatestSignal = m_sema->getCounterValue(); + } // If you don't want to deadlock here, look into the `abort*` family of methods ~TimelineEventHandlerST() { @@ -189,11 +192,11 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable { return [&](FunctorValuePair& p) -> bool { - if (p.geSemaValue>m_greatestSignal) + if (bailed || p.geSemaValue>m_greatestSignal) return false; - const bool last_bailed = bailed; + const bool bailedBefore = bailed; bailed = p.func(std::forward(args)...); - return !last_bailed; + return !bailedBefore; }; } @@ -201,7 +204,7 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable inline PollResult poll_impl(Args&&... args) { PollResult retval = {}; - constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)), bool>; + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; if constexpr (ReturnsBool) retval.eventsLeft = for_each_popping(constructBailing(retval.bailed, std::forward(args)...)); else @@ -223,6 +226,8 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable clear(); } + inline ILogicalDevice* getLogicalDevice() const {return m_device.get();} + inline const auto& getTimelines() const {return m_timelines;} // all the members are counteparts of the single timeline version @@ -348,6 +353,8 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable // but there's a fast path at the end else if (ReturnsBool || !allReady) it = eraseTimeline(it); + else + it++; } } // ultra fast path for non-bailing code when everything was covered by a single wait From fabc999ce72859d6c3f37226a5925424aeef32e8 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 23:14:42 
+0100 Subject: [PATCH 23/62] Take the TimelineEventHandlerST for a first spin with ICommandPoolCache --- .../nbl/video/utilities/ICommandPoolCache.h | 43 +++++++++----- .../nbl/video/utilities/IDescriptorSetCache.h | 58 +++++++++++++------ src/nbl/CMakeLists.txt | 1 - src/nbl/video/utilities/ICommandPoolCache.cpp | 12 +--- .../video/utilities/IDescriptorSetCache.cpp | 25 -------- 5 files changed, 68 insertions(+), 71 deletions(-) delete mode 100644 src/nbl/video/utilities/IDescriptorSetCache.cpp diff --git a/include/nbl/video/utilities/ICommandPoolCache.h b/include/nbl/video/utilities/ICommandPoolCache.h index f86ebde930..6f384aa60b 100644 --- a/include/nbl/video/utilities/ICommandPoolCache.h +++ b/include/nbl/video/utilities/ICommandPoolCache.h @@ -8,6 +8,7 @@ #include "nbl/asset/asset.h" #include "nbl/video/IGPUCommandPool.h" +#include "nbl/video/TimelineEventHandlers.h" namespace nbl::video @@ -18,7 +19,22 @@ class ICommandPoolCache : public core::IReferenceCounted public: using CommandPoolAllocator = core::PoolAddressAllocatorST; - NBL_API2 ICommandPoolCache(ILogicalDevice* const device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity); + // + static inline core::smart_refctd_ptr create(core::smart_refctd_ptr&& device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity) + { + auto cache = new core::smart_refctd_ptr[capacity]; + if (!cache) + return nullptr; + + for (auto i = 0u; icreateCommandPool(queueFamilyIx,_flags); + + void* reserved = malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u)); + if (!reserved) + return nullptr; + + return core::smart_refctd_ptr(new ICommandPoolCache(std::move(device),cache,capacity,reserved),core::dont_grab); + } // inline uint32_t getCapacity() const {return m_cmdPoolAllocator.get_total_size();} @@ -32,33 +48,26 @@ class ICommandPoolCache : public core::IReferenceCounted return nullptr; } -#if 0 // TODO: port // inline uint32_t acquirePool() { - 
m_deferredResets.pollForReadyEvents(DeferredCommandPoolResetter::single_poll); + m_deferredResets.poll(DeferredCommandPoolResetter::single_poll); return m_cmdPoolAllocator.alloc_addr(1u,1u); } - // needs to be called before you reset any fences which latch the deferred release - inline void poll_all() - { - m_deferredResets.pollForReadyEvents(DeferredCommandPoolResetter::exhaustive_poll); - } - // - inline void releaseSet(ILogicalDevice* device, core::smart_refctd_ptr&& fence, const uint32_t poolIx) + inline void releasePool(const ISemaphore::SWaitInfo& futureWait, const uint32_t poolIx) { if (poolIx==invalid_index) return; - if (fence) - m_deferredResets.addEvent(GPUEventWrapper(device,std::move(fence)),DeferredCommandPoolResetter(this,poolIx)); + if (futureWait.semaphore) + m_deferredResets.latch(futureWait,DeferredCommandPoolResetter(this,poolIx)); else releaseSet(poolIx); } - // only public because GPUDeferredEventHandlerST needs to know about it + // only public because MultiTimelineEventHandlerST needs to know about it class DeferredCommandPoolResetter { ICommandPoolCache* m_cache; @@ -106,13 +115,15 @@ class ICommandPoolCache : public core::IReferenceCounted NBL_API2 void operator()(); }; -#endif protected: friend class DeferredCommandPoolResetter; + inline ICommandPoolCache(core::smart_refctd_ptr&& device, core::smart_refctd_ptr* cache, const uint32_t capacity, void* reserved) : + m_cache(cache), m_reserved(malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u))), m_cmdPoolAllocator(m_reserved,0u,0u,1u,capacity,1u), m_deferredResets(std::move(device)) {} inline virtual ~ICommandPoolCache() { -// m_deferredResets.cullEvents(0u); + // normally the dtor would do this, but we need all the events to run before we delete the storage they reference + while (m_deferredResets.wait(std::chrono::steady_clock::now()+std::chrono::milliseconds(1))) {} free(m_reserved); delete[] m_cache; } @@ -122,7 +133,7 @@ class ICommandPoolCache : public core::IReferenceCounted 
core::smart_refctd_ptr* m_cache; void* m_reserved; CommandPoolAllocator m_cmdPoolAllocator; -// GPUDeferredEventHandlerST m_deferredResets; + MultiTimelineEventHandlerST m_deferredResets; }; } diff --git a/include/nbl/video/utilities/IDescriptorSetCache.h b/include/nbl/video/utilities/IDescriptorSetCache.h index c2e04906f9..c0ce5a0697 100644 --- a/include/nbl/video/utilities/IDescriptorSetCache.h +++ b/include/nbl/video/utilities/IDescriptorSetCache.h @@ -14,13 +14,33 @@ namespace nbl::video { -#if 0 // TODO: port class IDescriptorSetCache : public core::IReferenceCounted { public: using DescSetAllocator = core::PoolAddressAllocatorST; - IDescriptorSetCache(ILogicalDevice* device, core::smart_refctd_ptr&& _descPool, core::smart_refctd_ptr&& _canonicalLayout); + // + static inline core::smart_refctd_ptr create( + const uint32_t capacity, const IDescriptorPool::E_CREATE_FLAGS flags, + core::smart_refctd_ptr&& canonicalLayout + ) + { + if (capacity==0 || !canonicalLayout) + return nullptr; + void* reserved = malloc(DescSetAllocator::reserved_size(1u,capacity,1u)); + if (!reserved) + return nullptr; + auto* cache = new core::smart_refctd_ptr[capacity]; + if (!cache) + return nullptr; + auto device = const_cast(canonicalLayout->getOriginDevice()); + if (!device) + return nullptr; + auto pool = device->createDescriptorPoolForDSLayouts(flags,{&canonicalLayout.get(),1},&capacity); + if (!pool) + return nullptr; + return core::smart_refctd_ptr(new IDescriptorSetCache(std::move(pool),std::move(canonicalLayout),cache,reserved),core::dont_grab); + } // inline uint32_t getCapacity() const {return m_descPool->getCapacity();} @@ -40,26 +60,20 @@ class IDescriptorSetCache : public core::IReferenceCounted // inline uint32_t acquireSet() { - m_deferredReclaims.pollForReadyEvents(DeferredDescriptorSetReclaimer::single_poll); + m_deferredReclaims.poll(DeferredDescriptorSetReclaimer::single_poll); return m_setAllocator.alloc_addr(1u,1u); } - // needs to be called before you reset any 
fences which latch the deferred release - inline void poll_all() - { - m_deferredReclaims.pollForReadyEvents(DeferredDescriptorSetReclaimer::exhaustive_poll); - } - // - inline void releaseSet(ILogicalDevice* device, core::smart_refctd_ptr&& fence, const uint32_t setIx) + inline void releaseSet(const ISemaphore::SWaitInfo& futureWait, const uint32_t setIx) { if (setIx==invalid_index) return; - m_deferredReclaims.addEvent(GPUEventWrapper(device,std::move(fence)),DeferredDescriptorSetReclaimer(this,setIx)); + m_deferredReclaims.latch(futureWait,DeferredDescriptorSetReclaimer(this,setIx)); } - // only public because GPUDeferredEventHandlerST needs to know about it + // only public because MultiTimelineEventHandlerST needs to know about it class DeferredDescriptorSetReclaimer { IDescriptorSetCache* m_cache; @@ -70,7 +84,7 @@ class IDescriptorSetCache : public core::IReferenceCounted { } DeferredDescriptorSetReclaimer(const DeferredDescriptorSetReclaimer& other) = delete; - DeferredDescriptorSetReclaimer(DeferredDescriptorSetReclaimer&& other) : m_cache(nullptr), m_setIx(DescSetAllocator::invalid_address) + inline DeferredDescriptorSetReclaimer(DeferredDescriptorSetReclaimer&& other) : m_cache(nullptr), m_setIx(DescSetAllocator::invalid_address) { this->operator=(std::forward(other)); } @@ -116,10 +130,19 @@ class IDescriptorSetCache : public core::IReferenceCounted protected: friend class DeferredDescriptorSetReclaimer; - IDescriptorSetCache(ILogicalDevice* device, const uint32_t capacity); - virtual ~IDescriptorSetCache() + inline IDescriptorSetCache( + core::smart_refctd_ptr&& pool, + core::smart_refctd_ptr&& canonicalLayout, + core::smart_refctd_ptr* cache, + void* const reserved + ) : m_descPool(std::move(pool)), m_canonicalLayout(std::move(canonicalLayout)), m_cache(cache), + m_reserved(reserved), m_setAllocator(m_reserved,0u,0u,1u,m_descPool->getCapacity(),1u), + m_deferredReclaims(core::smart_refctd_ptr(const_cast(m_descPool->getOriginDevice()))) + {} + virtual 
inline ~IDescriptorSetCache() { - m_deferredReclaims.cullEvents(0u); + // normally the dtor would do this, but we need all the events to run before we delete the storage they reference + while (m_deferredReclaims.wait(std::chrono::steady_clock::now()+std::chrono::microseconds(100))) {} free(m_reserved); delete[] m_cache; } @@ -129,9 +152,8 @@ class IDescriptorSetCache : public core::IReferenceCounted core::smart_refctd_ptr* m_cache; void* m_reserved; DescSetAllocator m_setAllocator; - GPUDeferredEventHandlerST m_deferredReclaims; + MultiTimelineEventHandlerST m_deferredReclaims; }; -#endif } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 8297a09692..84c6b6c7ae 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -260,7 +260,6 @@ set(NBL_VIDEO_SOURCES ${NBL_ROOT_PATH}/src/nbl/video/alloc/CSimpleBufferAllocator.cpp # Utilities - ${NBL_ROOT_PATH}/src/nbl/video/utilities/IDescriptorSetCache.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/ICommandPoolCache.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/IPropertyPool.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/IUtilities.cpp diff --git a/src/nbl/video/utilities/ICommandPoolCache.cpp b/src/nbl/video/utilities/ICommandPoolCache.cpp index 4c38fb5dec..915a23c068 100644 --- a/src/nbl/video/utilities/ICommandPoolCache.cpp +++ b/src/nbl/video/utilities/ICommandPoolCache.cpp @@ -6,26 +6,16 @@ using namespace nbl; using namespace video; -ICommandPoolCache::ICommandPoolCache(ILogicalDevice* const device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity) - : m_reserved(malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u))), m_cmdPoolAllocator(m_reserved,0u,0u,1u,capacity,1u)//, m_deferredResets() -{ - m_cache = new core::smart_refctd_ptr[capacity]; - for (auto i=0u; icreateCommandPool(queueFamilyIx,_flags); -} - void ICommandPoolCache::releaseSet(const uint32_t poolIx) { m_cache[poolIx]->reset(); m_cmdPoolAllocator.free_addr(poolIx,1); } -#if 0 void 
ICommandPoolCache::DeferredCommandPoolResetter::operator()() { #ifdef _NBL_DEBUG assert(m_cache && m_poolIxgetCapacity()); #endif // _NBL_DEBUG m_cache->releaseSet(m_poolIx); -} -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/src/nbl/video/utilities/IDescriptorSetCache.cpp b/src/nbl/video/utilities/IDescriptorSetCache.cpp deleted file mode 100644 index d2025bd3f2..0000000000 --- a/src/nbl/video/utilities/IDescriptorSetCache.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include "nbl/video/IPhysicalDevice.h" -#include "nbl/video/ILogicalDevice.h" -#include "nbl/video/utilities/IDescriptorSetCache.h" - -using namespace nbl; -using namespace video; - - -#if 0 // TODO: port -IDescriptorSetCache::IDescriptorSetCache(ILogicalDevice* device, const uint32_t capacity) - : m_descPool(), m_canonicalLayout(), m_reserved(malloc(DescSetAllocator::reserved_size(1u,capacity,1u))), - m_setAllocator(m_reserved,0u,0u,1u,capacity,1u), m_deferredReclaims() -{ - m_cache = new core::smart_refctd_ptr[capacity]; - std::fill_n(m_cache,capacity,nullptr); -} - -IDescriptorSetCache::IDescriptorSetCache(ILogicalDevice* device, core::smart_refctd_ptr&& _descPool, core::smart_refctd_ptr&& _canonicalLayout) : IDescriptorSetCache(device,_descPool->getCapacity()) -{ - m_descPool = std::move(_descPool); - m_canonicalLayout = std::move(_canonicalLayout); - for (auto i=0u; icreateDescriptorSet(core::smart_refctd_ptr(m_canonicalLayout)); -} -#endif \ No newline at end of file From 0eb8e9a096ac2202d28e1b476235b64d9ee01038 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 23:29:58 +0100 Subject: [PATCH 24/62] turns out its quite easy to port the other utilities to the new MultiTimelineEventHandlerST --- include/nbl/video/ISemaphore.h | 4 +-- include/nbl/video/TimelineEventHandlers.h | 6 ++++ .../alloc/CAsyncSingleBufferSubAllocator.h | 28 +++++++++---------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/include/nbl/video/ISemaphore.h 
b/include/nbl/video/ISemaphore.h index f16fa86baf..0b14590e83 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -30,8 +30,8 @@ class ISemaphore : public IBackendObject // but don't want to pollute ILogicalDevice with lots of enums and structs struct SWaitInfo { - const ISemaphore* semaphore; - uint64_t value; + const ISemaphore* semaphore = nullptr; + uint64_t value = 0; }; enum class WAIT_RESULT : uint8_t { diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 33832c0605..902f82c8aa 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -55,6 +55,12 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable return poll_impl(std::forward(args)...); } + template + static inline Clock::time_point default_wait() + { + return Clock::now()+std::chrono::microseconds(50); + } + template inline uint32_t wait(const std::chrono::time_point& timeout_time, Args&&... args) { diff --git a/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h b/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h index c4d80cc7dc..f7b95464a7 100644 --- a/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h +++ b/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h @@ -1,19 +1,20 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _NBL_VIDEO_C_ASYNC_SINGLE_BUFFER_SUB_ALLOCATOR_H_ #define _NBL_VIDEO_C_ASYNC_SINGLE_BUFFER_SUB_ALLOCATOR_H_ + #include "nbl/core/alloc/GeneralpurposeAddressAllocator.h" +#include "nbl/video/alloc/CSingleBufferSubAllocator.h" +#include "nbl/video/TimelineEventHandlers.h" #include -#include "nbl/video/alloc/CSingleBufferSubAllocator.h" namespace nbl::video { -#if 0 // TODO: port namespace impl { // HostAllocator allocates both reserved space and the space needed for variable length records on the DeferredFreeFunctor @@ -134,7 +135,7 @@ class CAsyncSingleBufferSubAllocator std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); assert(tLock.owns_lock()); #endif // _NBL_DEBUG - return deferredFrees.cullEvents(0u); + return deferredFrees.poll(); } //! Returns max possible currently allocatable single allocation size, without having to wait for GPU more @@ -146,7 +147,7 @@ class CAsyncSingleBufferSubAllocator #endif // _NBL_DEBUG size_type valueToStopAt = getAddressAllocator().min_size()*3u; // padding, allocation, more padding = 3u // we don't actually want or need to poll all possible blocks to free, only first few - deferredFrees.pollForReadyEvents(valueToStopAt); + deferredFrees.poll(valueToStopAt); return getAddressAllocator().max_size(); } @@ -155,7 +156,7 @@ class CAsyncSingleBufferSubAllocator template inline size_type multi_allocate(uint32_t count, Args&&... args) noexcept { - return multi_alloc(GPUEventWrapper::default_wait(),count,std::forward(args)...); + return multi_alloc(decltype(deferredFrees)::default_wait(),count,std::forward(args)...); } //! 
attempt to allocate, if fail (presumably because of fragmentation), then keep trying till timeout is reached template @@ -174,7 +175,7 @@ class CAsyncSingleBufferSubAllocator // then try to wait at least once and allocate do { - deferredFrees.waitUntilForReadyEvents(maxWaitPoint,unallocatedSize); + deferredFrees.wait(maxWaitPoint,unallocatedSize); unallocatedSize = try_multi_alloc(args...); if (!unallocatedSize) @@ -185,13 +186,13 @@ class CAsyncSingleBufferSubAllocator } //! - inline void multi_deallocate(core::smart_refctd_ptr&& fence, DeferredFreeFunctor&& functor) noexcept + inline void multi_deallocate(const ISemaphore::SWaitInfo& futureWait, DeferredFreeFunctor&& functor) noexcept { #ifdef _NBL_DEBUG std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); assert(tLock.owns_lock()); #endif // _NBL_DEBUG - deferredFrees.addEvent(GPUEventWrapper(const_cast(m_composed.getBuffer()->getOriginDevice()),std::move(fence)),std::move(functor)); + deferredFrees.latch(futureWait,std::move(functor)); } inline void multi_deallocate(uint32_t count, const value_type* addr, const size_type* bytes) noexcept { @@ -203,17 +204,17 @@ class CAsyncSingleBufferSubAllocator } // TODO: improve signature of this function in the future template - inline void multi_deallocate(uint32_t count, const value_type* addr, const size_type* bytes, core::smart_refctd_ptr&& fence, const T*const *const objectsToDrop=nullptr) noexcept + inline void multi_deallocate(uint32_t count, const value_type* addr, const size_type* bytes, const ISemaphore::SWaitInfo& futureWait, const T*const *const objectsToDrop=nullptr) noexcept { - if (fence) - multi_deallocate(std::move(fence),DeferredFreeFunctor(&m_composed,count,addr,bytes,objectsToDrop)); + if (futureWait.semaphore) + multi_deallocate(futureWait,DeferredFreeFunctor(&m_composed,count,addr,bytes,objectsToDrop)); else multi_deallocate(count,addr,bytes); } protected: Composed m_composed; - GPUDeferredEventHandlerST deferredFrees; + 
MultiTimelineEventHandlerST deferredFrees; template inline value_type try_multi_alloc(uint32_t count, value_type* outAddresses, const size_type* byteSizes, const Args&... args) noexcept @@ -246,7 +247,6 @@ class CAsyncSingleBufferSubAllocatorST final : public core::IReferenceCounted, p template CAsyncSingleBufferSubAllocatorST(Args&&... args) : Base(std::forward(args)...) {} }; -#endif //MT version? From e59408dcc3787839af326c9f9495f562892e9ba0 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 00:17:31 +0100 Subject: [PATCH 25/62] remove more unused stuff --- include/IVideoCapabilityReporter.h | 67 ---------------- include/IVideoDriver.h | 34 -------- .../nbl/video/alloc/CSimpleBufferAllocator.h | 40 ---------- .../video/alloc/CStreamingBufferAllocator.h | 67 ---------------- .../nbl/video/alloc/GPUMemoryAllocatorBase.h | 27 ------- .../alloc/HostDeviceMirrorBufferAllocator.h | 69 ---------------- .../video/alloc/StreamingGPUBufferAllocator.h | 78 ------------------- include/nbl/video/declarations.h | 8 +- src/nbl/CMakeLists.txt | 3 - .../video/alloc/CSimpleBufferAllocator.cpp | 23 ------ 10 files changed, 3 insertions(+), 413 deletions(-) delete mode 100644 include/IVideoCapabilityReporter.h delete mode 100644 include/IVideoDriver.h delete mode 100644 include/nbl/video/alloc/CSimpleBufferAllocator.h delete mode 100644 include/nbl/video/alloc/CStreamingBufferAllocator.h delete mode 100644 include/nbl/video/alloc/GPUMemoryAllocatorBase.h delete mode 100644 include/nbl/video/alloc/HostDeviceMirrorBufferAllocator.h delete mode 100644 include/nbl/video/alloc/StreamingGPUBufferAllocator.h delete mode 100644 src/nbl/video/alloc/CSimpleBufferAllocator.cpp diff --git a/include/IVideoCapabilityReporter.h b/include/IVideoCapabilityReporter.h deleted file mode 100644 index 39fc6a5f54..0000000000 --- a/include/IVideoCapabilityReporter.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. 
-// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_I_VIDEO_CAPABILITY_REPORTER_H_INCLUDED__ -#define __NBL_I_VIDEO_CAPABILITY_REPORTER_H_INCLUDED__ - - - -namespace nbl -{ -namespace video -{ - //! . - class NBL_FORCE_EBO IVideoCapabilityReporter - { - public: - //! Get type of video driver - /** \return Type of driver. */ - //virtual E_DRIVER_TYPE getDriverType() const =0; - - //! enumeration for querying features of the video driver. - enum E_DRIVER_FEATURE - { - //! Supports Alpha To Coverage (always in OpenGL 4.3+, Vulkan Mobile GPUs don't) - EDF_ALPHA_TO_COVERAGE = 0, - - //! Supports geometry shaders (always in OpenGL 4.3+, Vulkan Mobile GPUs don't) - EDF_GEOMETRY_SHADER, - - //! Supports tessellation shaders (always in OpenGL 4.3+, Vulkan Mobile GPUs don't) - EDF_TESSELLATION_SHADER, - - //! If we can attach a stencil only texture to an FBO, if not must use Depth+Stencil - EDF_STENCIL_ONLY_TEXTURE, - - //! Whether we can get gl_DrawIDARB in GLSL (if not see https://www.g-truc.net/post-0518.html for ways to circumvent) - EDF_SHADER_DRAW_PARAMS, - - //! Whether we can force overlapping pixels to not rasterize in parallel, INTEL_fragment_shader_ordering, NV_fragment_shader_interlock or ARB_fragment_shader_interlock - EDF_FRAGMENT_SHADER_INTERLOCK, - - //! Whether textures can be used by their hardware handles bindlessly (without specifying them in descriptor sets) TODO: What to do about this? - EDF_BINDLESS_TEXTURE, - - //! Whether we can index samplers dynamically in a shader TODO: only in Vulkan or NV_gpu_shader5 - EDF_DYNAMIC_SAMPLER_INDEXING, - - //! A way to pass information between fragment shader invocations covering the same pixel - EDF_INPUT_ATTACHMENTS, - - //other feature ideas are; bindless buffers, sparse texture, sparse texture 2 - - //! 
Only used for counting the elements of this enum - EDF_COUNT - }; - - virtual uint16_t retrieveDisplayRefreshRate() const { return 0u; } - virtual uint32_t getMaxTextureBindingsCompute() const { return 0u; } - }; - -} // end namespace video -} // end namespace nbl - - -#endif - diff --git a/include/IVideoDriver.h b/include/IVideoDriver.h deleted file mode 100644 index 5a4a5d56ed..0000000000 --- a/include/IVideoDriver.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine" -// For conditions of distribution and use, see copyright notice in nabla.h -// See the original file in irrlicht source for authors - -#ifndef __NBL_I_VIDEO_DRIVER_H_INCLUDED__ -#define __NBL_I_VIDEO_DRIVER_H_INCLUDED__ - - -namespace nbl -{ -namespace video -{ -#if 0 - //! Legacy and deprecated system - class IVideoDriver : public IDriver - { - public: - //! - virtual void issueGPUTextureBarrier() =0; - - //! Event handler for resize events. Only used by the engine internally. - /** Used to notify the driver that the window was resized. - Usually, there is no need to call this method. */ - virtual void OnResize(const core::dimension2d& size) =0; - - }; -#endif - -} // end namespace video -} // end namespace nbl - - -#endif diff --git a/include/nbl/video/alloc/CSimpleBufferAllocator.h b/include/nbl/video/alloc/CSimpleBufferAllocator.h deleted file mode 100644 index 97cbe41b86..0000000000 --- a/include/nbl/video/alloc/CSimpleBufferAllocator.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2018-2022 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_C_SIMPLE_BUFFER_ALLOCATOR_H_ -#define _NBL_VIDEO_C_SIMPLE_BUFFER_ALLOCATOR_H_ - -#include "nbl/video/IDeviceMemoryAllocator.h" -#include "nbl/video/alloc/IBufferAllocator.h" - -namespace nbl::video -{ - -class CSimpleBufferAllocator : public IBufferAllocator -{ - core::smart_refctd_ptr m_device; - uint32_t m_memoryTypesToUse; - - public: - using value_type = asset::SBufferBinding; - - CSimpleBufferAllocator(core::smart_refctd_ptr&& _device, const uint32_t _memoryTypesToUse) : m_device(std::move(_device)), m_memoryTypesToUse(_memoryTypesToUse) {} - virtual ~CSimpleBufferAllocator() = default; - - inline ILogicalDevice* getDevice() {return m_device.get();} - - value_type allocate( - IGPUBuffer::SCreationParams&& creationParams, - const core::bitflag allocateFlags=IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE - ); - - inline void deallocate(value_type& allocation) - { - allocation = {IDeviceMemoryAllocator::SAllocation::InvalidMemoryOffset,nullptr}; - } -}; - -} - -#endif - diff --git a/include/nbl/video/alloc/CStreamingBufferAllocator.h b/include/nbl/video/alloc/CStreamingBufferAllocator.h deleted file mode 100644 index 2811a96f8f..0000000000 --- a/include/nbl/video/alloc/CStreamingBufferAllocator.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (C) 2018-2022 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_STREAMING_GPUBUFFER_ALLOCATOR_H_ -#define _NBL_VIDEO_STREAMING_GPUBUFFER_ALLOCATOR_H_ - -#include "nbl/video/alloc/CSimpleBufferAllocator.h" - -namespace nbl::video -{ - -class CStreamingBufferAllocator : protected CSimpleBufferAllocator -{ - public: - struct value_type - { - typename CSimpleBufferAllocator::value_type bufferBinding; - uint8_t* ptr; - }; - - using CSimpleBufferAllocator::CSimpleBufferAllocator; - virtual ~CStreamingBufferAllocator() = default; - - inline value_type allocate(IGPUBuffer::SCreationParams&& creationParams, const core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE) - { - auto bufferBinding = CSimpleBufferAllocator::allocate(std::move(creationParams),allocateFlags); - uint8_t* mappedPtr = nullptr; - if (bufferBinding.buffer) - { - IDeviceMemoryAllocation* const mem = bufferBinding.buffer->getBoundMemory().memory; - if (mem->isCurrentlyMapped()) - { - assert(mem->getMappedRange().offset == 0ull && mem->getMappedRange().length == mem->getAllocationSize()); - mappedPtr = reinterpret_cast(mem->getMappedPointer()); - } - else - { - core::bitflag access(IDeviceMemoryAllocation::EMCAF_NO_MAPPING_ACCESS); - const auto memProps = mem->getMemoryPropertyFlags(); - if (memProps.hasFlags(IDeviceMemoryAllocation::EMPF_HOST_READABLE_BIT)) - access |= IDeviceMemoryAllocation::EMCAF_READ; - if (memProps.hasFlags(IDeviceMemoryAllocation::EMPF_HOST_WRITABLE_BIT)) - access |= IDeviceMemoryAllocation::EMCAF_WRITE; - assert(access.value); - IDeviceMemoryAllocation::MemoryRange memoryRange = {0ull,mem->getAllocationSize()}; - mappedPtr = reinterpret_cast(mem->map(memoryRange,access)); - } - if (!mappedPtr) - CSimpleBufferAllocator::deallocate(bufferBinding); - mappedPtr += bufferBinding.buffer->getBoundMemory().offset+bufferBinding.offset; - } - return {std::move(bufferBinding),mappedPtr}; - } - - inline void deallocate(value_type& allocation) - { - 
allocation.ptr = nullptr; - auto* mem = allocation.bufferBinding.buffer->getBoundMemory().memory; - if (mem->getReferenceCount()==1) - mem->unmap(); - CSimpleBufferAllocator::deallocate(allocation.bufferBinding); - } -}; - -} - -#endif diff --git a/include/nbl/video/alloc/GPUMemoryAllocatorBase.h b/include/nbl/video/alloc/GPUMemoryAllocatorBase.h deleted file mode 100644 index d75411763a..0000000000 --- a/include/nbl/video/alloc/GPUMemoryAllocatorBase.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_GPU_MEMORY_ALLOCATOR_BASE_H__ -#define __NBL_VIDEO_GPU_MEMORY_ALLOCATOR_BASE_H__ - -namespace nbl::video -{ - -class ILogicalDevice; - -class GPUMemoryAllocatorBase -{ - protected: - ILogicalDevice* mDriver; // TODO: change to smartpointer backlink (after declarations_and_definitions branch merge) - - GPUMemoryAllocatorBase(ILogicalDevice* inDriver) : mDriver(inDriver) {} - virtual ~GPUMemoryAllocatorBase() = default; - public: - ILogicalDevice* getDriver() noexcept {return mDriver;} -}; - -} - - -#endif diff --git a/include/nbl/video/alloc/HostDeviceMirrorBufferAllocator.h b/include/nbl/video/alloc/HostDeviceMirrorBufferAllocator.h deleted file mode 100644 index 433007b7c9..0000000000 --- a/include/nbl/video/alloc/HostDeviceMirrorBufferAllocator.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_HOST_DEVICE_MIRROR_BUFFER_ALLOCATOR_H__ -#define __NBL_VIDEO_HOST_DEVICE_MIRROR_BUFFER_ALLOCATOR_H__ - - -#include "nbl/video/alloc/SimpleGPUBufferAllocator.h" - -namespace nbl::video -{ - -//class ILogicalDevice; - -template > -class HostDeviceMirrorBufferAllocator : protected SimpleGPUBufferAllocator -{ - HostAllocator hostAllocator; - public: - struct value_type - { - typename SimpleGPUBufferAllocator::value_type buffer; - uint8_t* ptr; // maybe a ICPUBuffer in the future? - }; - - HostDeviceMirrorBufferAllocator(ILogicalDevice* inDriver); - virtual ~HostDeviceMirrorBufferAllocator() = default; - - inline value_type allocate(size_t bytes, size_t alignment) noexcept - { - auto buff = SimpleGPUBufferAllocator::allocate(bytes,alignment); - if (!buff) - return {nullptr,nullptr}; - auto hostPtr = hostAllocator.allocate(bytes,alignment); - if (!hostPtr) - { - SimpleGPUBufferAllocator::deallocate(buff); - return {nullptr,nullptr}; - } - return {std::move(buff),hostPtr}; - } - - inline void deallocate(value_type& allocation) noexcept - { - hostAllocator.deallocate(allocation.ptr,allocation.buffer->getSize()); - SimpleGPUBufferAllocator::deallocate(allocation.buffer); - allocation.ptr = nullptr; - } -#if 0 - //to expose base functions again - IDriver* getDriver() noexcept {return SimpleGPUBufferAllocator::getDriver();} -#endif -}; - - -} - -#include "nbl/video/ILogicalDevice.h" - -namespace nbl::video -{ - -template -HostDeviceMirrorBufferAllocator::HostDeviceMirrorBufferAllocator(ILogicalDevice* inDriver) : SimpleGPUBufferAllocator(inDriver,inDriver->getDeviceLocalGPUMemoryReqs()) {} - -} - -#endif diff --git a/include/nbl/video/alloc/StreamingGPUBufferAllocator.h b/include/nbl/video/alloc/StreamingGPUBufferAllocator.h deleted file mode 100644 index 0c01f6703e..0000000000 --- a/include/nbl/video/alloc/StreamingGPUBufferAllocator.h +++ /dev/null @@ -1,78 +0,0 @@ -// 
Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_STREAMING_GPUBUFFER_ALLOCATOR_H__ -#define __NBL_VIDEO_STREAMING_GPUBUFFER_ALLOCATOR_H__ - -#include "nbl/video/alloc/SimpleGPUBufferAllocator.h" - -namespace nbl::video -{ - -//class ILogicalDevice; - -class StreamingGPUBufferAllocator : protected SimpleGPUBufferAllocator -{ - private: - void* mapWrapper(IDriverMemoryAllocation* mem, IDriverMemoryAllocation::E_MAPPING_CPU_ACCESS_FLAG access, const IDriverMemoryAllocation::MemoryRange& range) noexcept; - void unmapWrapper(IDriverMemoryAllocation* mem) noexcept; - - public: - struct value_type - { - typename SimpleGPUBufferAllocator::value_type buffer; - uint8_t* ptr; - }; - - StreamingGPUBufferAllocator(ILogicalDevice* inDriver, const IDriverMemoryBacked::SDriverMemoryRequirements& bufferReqs) : SimpleGPUBufferAllocator(inDriver,bufferReqs) - { - assert(mBufferMemReqs.mappingCapability&IDriverMemoryAllocation::EMCAF_READ_AND_WRITE); // have to have mapping access to the buffer! 
- } - virtual ~StreamingGPUBufferAllocator() = default; - - inline value_type allocate(size_t bytes, size_t alignment) noexcept - { - auto buff = SimpleGPUBufferAllocator::allocate(bytes,alignment); - if (!buff) - return {nullptr,nullptr}; - auto* const mem = buff->getBoundMemory(); - uint8_t* mappedPtr; - if (mem->isCurrentlyMapped()) - { - assert(mem->getMappedRange().offset==0ull && mem->getMappedRange().length==mem->getAllocationSize()); // whole range must be mapped always - mappedPtr = reinterpret_cast(mem->getMappedPointer()); - } - else - { - const auto mappingCaps = mem->getMappingCaps()&IDriverMemoryAllocation::EMCAF_READ_AND_WRITE; - const auto rangeToMap = IDriverMemoryAllocation::MemoryRange{0u,mem->getAllocationSize()}; - mappedPtr = reinterpret_cast(mapWrapper(mem,static_cast(mappingCaps),rangeToMap)); - } - if (!mappedPtr) - { - SimpleGPUBufferAllocator::deallocate(buff); - return {nullptr,nullptr}; - } - mappedPtr += buff->getBoundMemoryOffset(); - return {std::move(buff),mappedPtr}; - } - - inline void deallocate(value_type& allocation) noexcept - { - allocation.ptr = nullptr; - auto* mem = allocation.buffer->getBoundMemory(); - if (mem->getReferenceCount()==1) - unmapWrapper(mem); - SimpleGPUBufferAllocator::deallocate(allocation.buffer); - } -#if 0 - //to expose base functions again - ILogicalDevice* getDriver() noexcept {return SimpleGPUBufferAllocator::getDriver();} -#endif -}; - - -} - -#endif diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index 38e6d2cca6..a300dc1c2d 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -1,9 +1,8 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_DECLARATIONS_H_INCLUDED__ -#define __NBL_VIDEO_DECLARATIONS_H_INCLUDED__ +#ifndef _NBL_VIDEO_DECLARATIONS_H_INCLUDED_ +#define _NBL_VIDEO_DECLARATIONS_H_INCLUDED_ // dependencies @@ -16,7 +15,6 @@ //#include "nbl/video/asset_traits.h" // alloc -#include "nbl/video/alloc/CStreamingBufferAllocator.h" #include "nbl/video/alloc/StreamingTransientDataBuffer.h" // platform and API specific stuff diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 84c6b6c7ae..517485d08c 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -256,9 +256,6 @@ set(NBL_ASSET_SOURCES ${NBL_ROOT_PATH}/src/nbl/asset/material_compiler/CMaterialCompilerGLSLRasterBackend.cpp ) set(NBL_VIDEO_SOURCES -# Allocators - ${NBL_ROOT_PATH}/src/nbl/video/alloc/CSimpleBufferAllocator.cpp - # Utilities ${NBL_ROOT_PATH}/src/nbl/video/utilities/ICommandPoolCache.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/IPropertyPool.cpp diff --git a/src/nbl/video/alloc/CSimpleBufferAllocator.cpp b/src/nbl/video/alloc/CSimpleBufferAllocator.cpp deleted file mode 100644 index 54666f9b64..0000000000 --- a/src/nbl/video/alloc/CSimpleBufferAllocator.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#include "nbl/video/IPhysicalDevice.h" -#include "nbl/video/ILogicalDevice.h" -#include "nbl/video/alloc/CSimpleBufferAllocator.h" - -using namespace nbl; -using namespace video; - -CSimpleBufferAllocator::value_type CSimpleBufferAllocator::allocate( - IGPUBuffer::SCreationParams&& creationParams, - const core::bitflag allocateFlags) -{ - auto buffer = m_device->createBuffer(std::move(creationParams)); - auto reqs = buffer->getMemoryReqs(); - reqs.memoryTypeBits &= m_memoryTypesToUse; - auto mem = m_device->allocate(reqs,buffer.get(),allocateFlags); - if (!mem.memory) - return {0xdeadbeefull,nullptr}; - return {0ull,std::move(buffer)}; -} \ No newline at end of file From 3f41a81b8515ae5ee79d2e271776886732c7061f Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 00:50:17 +0100 Subject: [PATCH 26/62] fix one liner huge bug --- include/nbl/builtin/hlsl/cpp_compat/intrinsics.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h index 1c75abe891..c606bb4d58 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h +++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h @@ -53,7 +53,7 @@ inline T determinant(const matrix& m) NBL_BIT_OP_GLM_PASSTHROUGH(findLSB,findLSB) -NBL_BIT_OP_GLM_PASSTHROUGH(findMSB,findLSB) +NBL_BIT_OP_GLM_PASSTHROUGH(findMSB,findMSB) // inverse not defined cause its implemented via hidden friend template From fb1f50dc6b58564fb137dc0c78efa00e2ce46bcc Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 01:07:39 +0100 Subject: [PATCH 27/62] fix a smal bug and introduce a base class for TimelineEventHandler, also get everything to compile --- include/nbl/video/TimelineEventHandlers.h | 48 +++++++++++-------- .../alloc/CAsyncSingleBufferSubAllocator.h | 7 +-- .../alloc/StreamingTransientDataBuffer.h | 24 +++++----- 3 files changed, 45 
insertions(+), 34 deletions(-) diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 902f82c8aa..925829c68f 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -9,28 +9,48 @@ namespace nbl::video { +class TimelineEventHandlerBase : core::Unmovable, core::Uncopyable +{ + public: + struct PollResult + { + uint32_t eventsLeft = ~0u; + bool bailed = false; + }; + + // little utility + inline const ISemaphore* getSemaphore() const { return m_sema.get(); } + + // todo: rename to default_wait_point ? + template + static inline Clock::time_point default_wait() + { + return Clock::now()+std::chrono::microseconds(50); + } + + protected: + TimelineEventHandlerBase(core::smart_refctd_ptr&& sema) : m_sema(std::move(sema)) {} + + core::smart_refctd_ptr m_sema; +}; + template class MultiTimelineEventHandlerST; // Could be made MT and relatively lockless, if only had a good lock-few circular buffer impl // Not sure its worth the effort as anything using this will probably need to be lockful to be MT template -class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable +class TimelineEventHandlerST final : TimelineEventHandlerBase { public: // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. 
inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : - m_sema(std::move(sema)), m_greatestLatch(0) - { - m_greatestSignal = m_sema->getCounterValue(); - } + TimelineEventHandlerBase(std::move(sema)), m_greatestLatch(0), m_greatestSignal(m_sema->getCounterValue()) {} // If you don't want to deadlock here, look into the `abort*` family of methods ~TimelineEventHandlerST() { while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} } - // little utility - inline const ISemaphore* getSemaphore() const {return m_sema.get();} inline uint32_t count() const {return m_cb.size();} @@ -44,23 +64,12 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable } // - struct PollResult - { - uint32_t eventsLeft = ~0u; - bool bailed = false; - }; template inline PollResult poll(Args&&... args) { return poll_impl(std::forward(args)...); } - template - static inline Clock::time_point default_wait() - { - return Clock::now()+std::chrono::microseconds(50); - } - template inline uint32_t wait(const std::chrono::time_point& timeout_time, Args&&... args) { @@ -160,7 +169,6 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable }; // could be a circular buffer but whatever for now core::deque m_cb; - core::smart_refctd_ptr m_sema; uint64_t m_greatestSignal; uint64_t m_greatestLatch; @@ -410,7 +418,7 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable inline container_t::iterator eraseTimeline(typename container_t::iterator timeline) { // if not the last in scratch - if (timeline->waitInfoIxwaitInfoIx+1 - inline CAsyncSingleBufferSubAllocator(Args&&... args) : m_composed(std::forward(args)...) {} + inline CAsyncSingleBufferSubAllocator(Args&&... 
args) : m_composed(std::forward(args)...), + deferredFrees(core::smart_refctd_ptr(const_cast(m_composed.getBuffer()->getOriginDevice()))) {} virtual ~CAsyncSingleBufferSubAllocator() {} @@ -135,7 +136,7 @@ class CAsyncSingleBufferSubAllocator std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); assert(tLock.owns_lock()); #endif // _NBL_DEBUG - return deferredFrees.poll(); + return deferredFrees.poll().eventsLeft; } //! Returns max possible currently allocatable single allocation size, without having to wait for GPU more @@ -200,7 +201,7 @@ class CAsyncSingleBufferSubAllocator std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); assert(tLock.owns_lock()); #endif // _NBL_DEBUG - multi_deallocate(count,addr,bytes,nullptr); + multi_deallocate(count,addr,bytes,{}); } // TODO: improve signature of this function in the future template diff --git a/include/nbl/video/alloc/StreamingTransientDataBuffer.h b/include/nbl/video/alloc/StreamingTransientDataBuffer.h index 27c3ec7e81..fd8eee7347 100644 --- a/include/nbl/video/alloc/StreamingTransientDataBuffer.h +++ b/include/nbl/video/alloc/StreamingTransientDataBuffer.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h #ifndef _NBL_VIDEO_STREAMING_TRANSIENT_DATA_BUFFER_H_ @@ -15,7 +15,6 @@ namespace nbl::video { -#if 0 // TODO: port template, class RecursiveLockable=std::recursive_mutex> class StreamingTransientDataBufferMT; @@ -39,24 +38,28 @@ class StreamingTransientDataBuffer template inline StreamingTransientDataBuffer(asset::SBufferRange&& _bufferRange, Args&&... args) : m_composed(std::move(_bufferRange),std::forward(args)...) 
{ - assert(getBuffer()->getBoundMemory()->isMappable()); - assert(getBuffer()->getBoundMemory()->getMappedPointer()); + assert(getBuffer()->getBoundMemory().memory->isMappable()); + assert(getBuffer()->getBoundMemory().memory->getMappedPointer()); // we're suballocating from a buffer, whole buffer needs to be reachable from the mapped pointer - const auto mappedRange = getBuffer()->getBoundMemory()->getMappedRange(); - assert(mappedRange.offset<=getBuffer()->getBoundMemoryOffset()); - assert(mappedRange.offset+mappedRange.length>=getBuffer()->getBoundMemoryOffset()+getBuffer()->getSize()); + const auto mappedRange = getBuffer()->getBoundMemory().memory->getMappedRange(); + assert(mappedRange.offset<=getBuffer()->getBoundMemory().offset); + assert(mappedRange.offset+mappedRange.length>=getBuffer()->getBoundMemory().offset+getBuffer()->getSize()); } virtual ~StreamingTransientDataBuffer() {} // - inline bool needsManualFlushOrInvalidate() const {return getBuffer()->getBoundMemory()->haveToMakeVisible();} + inline bool needsManualFlushOrInvalidate() const {return getBuffer()->getBoundMemory().memory->haveToMakeVisible();} // getters inline IGPUBuffer* getBuffer() noexcept {return m_composed.getBuffer();} inline const IGPUBuffer* getBuffer() const noexcept {return m_composed.getBuffer();} // - inline void* getBufferPointer() noexcept {return getBuffer()->getBoundMemory()->getMappedPointer();} + inline void* getBufferPointer() noexcept + { + const auto bound = getBuffer()->getBoundMemory(); + return reinterpret_cast(bound.memory->getMappedPointer())+bound.offset; + } // inline uint32_t cull_frees() noexcept {return m_composed.cull_frees();} @@ -99,7 +102,7 @@ class StreamingTransientDataBuffer template inline size_type multi_place(uint32_t count, Args&&... 
args) noexcept { - return multi_place(GPUEventWrapper::default_wait(), count, std::forward(args)...); + return multi_place(TimelineEventHandlerBase::default_wait(),count,std::forward(args)...); } }; } @@ -206,7 +209,6 @@ class StreamingTransientDataBufferMT : public core::IReferenceCounted return lock; } }; -#endif } From 94ee6805d671cff23b2b42e58e3157dc64e32ee0 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 01:09:23 +0100 Subject: [PATCH 28/62] fix one more KHR function pointer bug and remove unused class --- .../nbl/video/alloc/SubAllocatedDataBuffer.h | 304 ------------------ src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- 2 files changed, 1 insertion(+), 305 deletions(-) delete mode 100644 include/nbl/video/alloc/SubAllocatedDataBuffer.h diff --git a/include/nbl/video/alloc/SubAllocatedDataBuffer.h b/include/nbl/video/alloc/SubAllocatedDataBuffer.h deleted file mode 100644 index 8b09d1bf45..0000000000 --- a/include/nbl/video/alloc/SubAllocatedDataBuffer.h +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_SUB_ALLOCATED_DATA_BUFFER_H__ -#define __NBL_VIDEO_SUB_ALLOCATED_DATA_BUFFER_H__ - -#include "nbl/core/declarations.h" - -#include -#include - -#include "nbl/video/alloc/SimpleGPUBufferAllocator.h" -#include "nbl/video/IGPUFence.h" - -namespace nbl::video -{ - -namespace impl -{ -template -class SubAllocatedDataBuffer : protected core::impl::FriendOfHeterogenousMemoryAddressAllocatorAdaptor -{ - public: - typedef typename HeterogenousMemoryAddressAllocator::OtherAllocatorType GPUBufferAllocator; - typedef typename HeterogenousMemoryAddressAllocator::HostAllocatorType CPUAllocator; - typedef typename HeterogenousMemoryAddressAllocator::size_type size_type; - static constexpr size_type invalid_address = HeterogenousMemoryAddressAllocator::invalid_address; - - private: - #ifdef _NBL_DEBUG - std::recursive_mutex stAccessVerfier; - #endif // _NBL_DEBUG - typedef SubAllocatedDataBuffer ThisType; - - template using buffer_type = decltype(U::buffer); - template struct has_buffer_member : std::false_type {}; - template struct has_buffer_member> > : std::is_same,core::smart_refctd_ptr> {}; - protected: - HeterogenousMemoryAddressAllocator mAllocator; - ILogicalDevice* mDevice; // TODO: smartpointer backlink - - template - inline size_type try_multi_alloc(uint32_t count, size_type* outAddresses, const size_type* bytes, const Args&... 
args) noexcept - { - mAllocator.multi_alloc_addr(count,outAddresses,bytes,args...); - - size_type unallocatedSize = 0; - for (uint32_t i=0u; i - inline DefaultDeferredFreeFunctor(ThisType* _this, size_type numAllocsToFree, const size_type* addrs, const size_type* bytes, const T*const *const objectsToHold) - : sadbRef(_this), rangeData(nullptr), numAllocs(numAllocsToFree) - { - static_assert(std::is_base_of_v); - - rangeData = reinterpret_cast(sadbRef->getFunctorAllocator().allocate(numAllocs,sizeof(void*))); - auto out = rangeData; - memcpy(out,addrs,sizeof(size_type)*numAllocs); - out += numAllocs; - memcpy(out,bytes,sizeof(size_type)*numAllocs); - out += numAllocs; - auto* const objHoldIt = reinterpret_cast*>(out); - for (size_t i=0u; i(out)[i] = nullptr; // clear it first - if (objectsToHold) - objHoldIt[i] = core::smart_refctd_ptr(objectsToHold[i]); - } - } - DefaultDeferredFreeFunctor(const DefaultDeferredFreeFunctor& other) = delete; - inline DefaultDeferredFreeFunctor(DefaultDeferredFreeFunctor&& other) : sadbRef(nullptr), rangeData(nullptr), numAllocs(0u) - { - this->operator=(std::forward(other)); - } - - inline ~DefaultDeferredFreeFunctor() - { - if (rangeData) - { - auto alloctr = sadbRef->getFunctorAllocator(); - alloctr.deallocate(reinterpret_cast::pointer>(rangeData),numAllocs); - } - } - - DefaultDeferredFreeFunctor& operator=(const DefaultDeferredFreeFunctor& other) = delete; - inline DefaultDeferredFreeFunctor& operator=(DefaultDeferredFreeFunctor&& other) - { - if (rangeData) // could swap the values instead - { - auto alloctr = sadbRef->getFunctorAllocator(); - alloctr.deallocate(reinterpret_cast::pointer>(rangeData),numAllocs); - } - sadbRef = other.sadbRef; - rangeData = other.rangeData; - numAllocs = other.numAllocs; - other.sadbRef = nullptr; - other.rangeData = nullptr; - other.numAllocs = 0u; - return *this; - } - - inline bool operator()(size_type& unallocatedSize) - { - operator()(); - for (size_type i=0u; ifreedSize) - unallocatedSize -= 
freedSize; - else - { - unallocatedSize = 0u; - return true; - } - } - return unallocatedSize==0u; - } - - inline void operator()() - { - #ifdef _NBL_DEBUG - assert(sadbRef && rangeData); - #endif // _NBL_DEBUG - HeterogenousMemoryAddressAllocator& alloctr = sadbRef->getAllocator(); - alloctr.multi_free_addr(numAllocs,rangeData,rangeData+numAllocs); - auto* const objHoldIt = reinterpret_cast*>(rangeData+numAllocs*2u); - for (size_t i=0u; i::value; - using DeferredFreeFunctor = std::conditional_t; - GPUDeferredEventHandlerST deferredFrees; - core::allocator > functorAllocator; // TODO : CMemoryPool a-la naughty do - public: - SubAllocatedDataBuffer() {} - - virtual ~SubAllocatedDataBuffer() {} - - //! - template - SubAllocatedDataBuffer(ILogicalDevice* dev, Args&&... args) : mAllocator(std::forward(args)...), mDevice(dev) - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - } - - - //! Mutable version for `DefaultDeferredFreeFunctor` and `StreamingTransientDataBuffer` ONLY! - inline HeterogenousMemoryAddressAllocator& getAllocator() noexcept { return mAllocator; } - //! - const HeterogenousMemoryAddressAllocator& getAllocator() const {return mAllocator;} - - //! - inline const IGPUBuffer* getBuffer() const noexcept - { - auto allocation = mAllocator.getCurrentBufferAllocation(); - - IGPUBuffer* retval; - if constexpr(has_buffer_member::value) - { - retval = allocation.buffer.get(); - } - else - { - retval = allocation.get(); - } - - - return retval; - } - inline IGPUBuffer* getBuffer() noexcept - { - return const_cast(static_cast(this)->getBuffer()); - } - - //! - inline uint32_t cull_frees() noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - return deferredFrees.cullEvents(0u); - } - - //! 
Returns max possible currently allocatable single allocation size, without having to wait for GPU more - inline size_type max_size() noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - size_type valueToStopAt = mAllocator.getAddressAllocator().min_size()*3u; // padding, allocation, more padding = 3u - // we don't actually want or need to poll all possible blocks to free, only first few - deferredFrees.pollForReadyEvents(valueToStopAt); - return mAllocator.getAddressAllocator().max_size(); - } - //! Returns max requestable alignment on the allocation (w.r.t. backing memory start) - inline size_type max_alignment() const noexcept {return mAllocator.getAddressAllocator().max_alignment();} - - - //! - template - inline size_type multi_alloc(uint32_t count, Args&&... args) noexcept - { - return multi_alloc(GPUEventWrapper::default_wait(),count,std::forward(args)...); - } - //! - template - inline size_type multi_alloc(const std::chrono::time_point& maxWaitPoint, const Args&... 
args) noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - - // try allocate once - size_type unallocatedSize = try_multi_alloc(args...); - if (!unallocatedSize) - return 0u; - - // then try to wait at least once and allocate - do - { - deferredFrees.waitUntilForReadyEvents(maxWaitPoint,unallocatedSize); - - unallocatedSize = try_multi_alloc(args...); - if (!unallocatedSize) - return 0u; - } while(Clock::now()&& fence, DeferredFreeFunctor&& functor) noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - deferredFrees.addEvent(GPUEventWrapper(mDevice, std::move(fence)),std::forward(functor)); - } - inline void multi_free(uint32_t count, const size_type* addr, const size_type* bytes) noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - mAllocator.multi_free_addr(count,addr,bytes); - } - template - inline void multi_free(uint32_t count, const size_type* addr, const size_type* bytes, core::smart_refctd_ptr&& fence, const T*const *const objectsToDrop=nullptr) noexcept - { - if (fence) - multi_free(std::move(fence),DeferredFreeFunctor(this,count,addr,bytes,objectsToDrop)); - else - multi_free(count,addr,bytes); - } -}; -} - -// this buffer is not growable -template< typename _size_type=uint32_t, class BasicAddressAllocator=core::GeneralpurposeAddressAllocator<_size_type>, class GPUBufferAllocator=SimpleGPUBufferAllocator, class CPUAllocator=core::allocator > -class SubAllocatedDataBufferST : public core::IReferenceCounted, public impl::SubAllocatedDataBuffer > -{ - using Base = impl::SubAllocatedDataBuffer >; - protected: - ~SubAllocatedDataBufferST() = default; - public: - template - SubAllocatedDataBufferST(Args&&... args) : Base(std::forward(args)...) {} -}; - - -//MT version? 
- -} - -#endif - - - - diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 0714481ac8..844bfc54cb 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -307,7 +307,7 @@ bool CVulkanLogicalDevice::bindBufferMemory_impl(const uint32_t count, const SBi { VkBufferDeviceAddressInfoKHR info = {VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR,nullptr}; info.buffer = vulkanBuffer->getInternalObject(); - vulkanBuffer->setDeviceAddress(m_devf.vk.vkGetBufferDeviceAddressKHR(m_vkdev,&info)); + vulkanBuffer->setDeviceAddress(m_devf.vk.vkGetBufferDeviceAddress(m_vkdev,&info)); } } return true; From c761d424dc5c51754a731de29a416d93315957a1 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 04:43:05 +0100 Subject: [PATCH 29/62] bring back bits of IUtilities needed for ex 05 --- include/nbl/video/utilities/IUtilities.h | 82 ++++++++++++------------ 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 029df30144..89d45da3f7 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -1,3 +1,6 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _NBL_VIDEO_I_UTILITIES_H_INCLUDED_ #define _NBL_VIDEO_I_UTILITIES_H_INCLUDED_ @@ -16,7 +19,6 @@ namespace nbl::video { -#if 0 // TODO: port class NBL_API2 IUtilities : public core::IReferenceCounted { protected: @@ -29,9 +31,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted nbl::system::logger_opt_smart_ptr m_logger; public: - IUtilities(core::smart_refctd_ptr&& device, nbl::system::logger_opt_smart_ptr&& logger = nullptr, const uint32_t downstreamSize = 0x4000000u, const uint32_t upstreamSize = 0x4000000u) - : m_device(std::move(device)) - , m_logger(std::move(logger)) + IUtilities(core::smart_refctd_ptr&& device, nbl::system::logger_opt_smart_ptr&& logger=nullptr, const uint32_t downstreamSize=0x4000000u, const uint32_t upstreamSize=0x4000000u) + : m_device(std::move(device)), m_logger(std::move(logger)) { auto physicalDevice = m_device->getPhysicalDevice(); const auto& limits = physicalDevice->getLimits(); @@ -39,22 +40,23 @@ class NBL_API2 IUtilities : public core::IReferenceCounted auto queueFamProps = physicalDevice->getQueueFamilyProperties(); uint32_t minImageTransferGranularityVolume = 1u; // minImageTransferGranularity.width * height * depth - for (uint32_t i = 0; i < queueFamProps.size(); i++) + for (auto& qf : queueFamProps) { - uint32_t volume = queueFamProps[i].minImageTransferGranularity.width * queueFamProps[i].minImageTransferGranularity.height * queueFamProps[i].minImageTransferGranularity.depth; - if(minImageTransferGranularityVolume < volume) + uint32_t volume = qf.minImageTransferGranularity.width*qf.minImageTransferGranularity.height*qf.minImageTransferGranularity.depth; + if(minImageTransferGranularityVolume(limits.nonCoherentAtomSize); - m_allocationAlignmentForBufferImageCopy = core::max(static_cast(limits.optimalBufferCopyOffsetAlignment), m_allocationAlignment); + m_allocationAlignment = limits.nonCoherentAtomSize; + 
m_allocationAlignmentForBufferImageCopy = core::max(limits.optimalBufferCopyOffsetAlignment,m_allocationAlignment); - const uint32_t bufferOptimalTransferAtom = limits.maxResidentInvocations*sizeof(uint32_t); + constexpr uint32_t OptimalCoalescedInvocationXferSize = sizeof(uint32_t); + const uint32_t bufferOptimalTransferAtom = limits.maxResidentInvocations * OptimalCoalescedInvocationXferSize; const uint32_t maxImageOptimalTransferAtom = limits.maxResidentInvocations * asset::TexelBlockInfo(asset::EF_R64G64B64A64_SFLOAT).getBlockByteSize() * minImageTransferGranularityVolume; - const uint32_t minImageOptimalTransferAtom = limits.maxResidentInvocations * asset::TexelBlockInfo(asset::EF_R8_UINT).getBlockByteSize();; - const uint32_t maxOptimalTransferAtom = core::max(bufferOptimalTransferAtom, maxImageOptimalTransferAtom); - const uint32_t minOptimalTransferAtom = core::min(bufferOptimalTransferAtom, minImageOptimalTransferAtom); + const uint32_t minImageOptimalTransferAtom = limits.maxResidentInvocations * asset::TexelBlockInfo(asset::EF_R8_UINT).getBlockByteSize(); + const uint32_t maxOptimalTransferAtom = core::max(bufferOptimalTransferAtom,maxImageOptimalTransferAtom); + const uint32_t minOptimalTransferAtom = core::min(bufferOptimalTransferAtom,minImageOptimalTransferAtom); // allocationAlignment <= minBlockSize <= minOptimalTransferAtom <= maxOptimalTransferAtom <= stagingBufferSize/4 assert(m_allocationAlignment <= minStreamingBufferAllocationSize); @@ -62,8 +64,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted assert(minStreamingBufferAllocationSize <= minOptimalTransferAtom); - assert(maxOptimalTransferAtom * 4u <= upstreamSize); - assert(maxOptimalTransferAtom * 4u <= downstreamSize); + assert(maxOptimalTransferAtom*OptimalCoalescedInvocationXferSize <= upstreamSize); + assert(maxOptimalTransferAtom*OptimalCoalescedInvocationXferSize <= downstreamSize); assert(minStreamingBufferAllocationSize % m_allocationAlignment == 0u); 
assert(minStreamingBufferAllocationSize % m_allocationAlignmentForBufferImageCopy == 0u); @@ -71,15 +73,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted const auto& enabledFeatures = m_device->getEnabledFeatures(); IGPUBuffer::SCreationParams streamingBufferCreationParams = {}; - auto commonUsages = core::bitflag(IGPUBuffer::EUF_STORAGE_TEXEL_BUFFER_BIT)|IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - if(enabledFeatures.bufferDeviceAddress) - commonUsages |= IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + auto commonUsages = core::bitflag(IGPUBuffer::EUF_STORAGE_TEXEL_BUFFER_BIT)|IGPUBuffer::EUF_STORAGE_BUFFER_BIT|IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; if (enabledFeatures.accelerationStructure) commonUsages |= IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - core::bitflag allocateFlags(IDeviceMemoryAllocation::EMAF_NONE); - if(enabledFeatures.bufferDeviceAddress) - allocateFlags |= IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT; + core::bitflag allocateFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); { IGPUBuffer::SCreationParams streamingBufferCreationParams = {}; @@ -102,8 +100,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted if (memProps.hasFlags(IDeviceMemoryAllocation::EMPF_HOST_WRITABLE_BIT)) access |= IDeviceMemoryAllocation::EMCAF_WRITE; assert(access.value); - IDeviceMemoryAllocation::MappedMemoryRange memoryRange = {mem.get(),0ull,mem->getAllocationSize()}; - m_device->mapMemory(memoryRange, access); + mem->map({0ull,reqs.size},access); m_defaultDownloadBuffer = core::make_smart_refctd_ptr>(asset::SBufferRange{0ull,downstreamSize,std::move(buffer)},maxStreamingBufferAllocationAlignment,minStreamingBufferAllocationSize); m_defaultDownloadBuffer->getBuffer()->setObjectDebugName(("Default Download Buffer of Utilities "+std::to_string(ptrdiff_t(this))).c_str()); @@ -130,23 +127,22 @@ class NBL_API2 IUtilities : public core::IReferenceCounted if (memProps.hasFlags(IDeviceMemoryAllocation::EMPF_HOST_WRITABLE_BIT)) access |= 
IDeviceMemoryAllocation::EMCAF_WRITE; assert(access.value); - IDeviceMemoryAllocation::MappedMemoryRange memoryRange = {mem.get(),0ull,mem->getAllocationSize()}; - m_device->mapMemory(memoryRange, access); + mem->map({0ull,reqs.size},access); m_defaultUploadBuffer = core::make_smart_refctd_ptr>(asset::SBufferRange{0ull,upstreamSize,std::move(buffer)},maxStreamingBufferAllocationAlignment,minStreamingBufferAllocationSize); m_defaultUploadBuffer->getBuffer()->setObjectDebugName(("Default Upload Buffer of Utilities "+std::to_string(ptrdiff_t(this))).c_str()); } +#if 0 // TODO: port m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); // smaller workgroups fill occupancy gaps better, especially on new Nvidia GPUs, but we don't want too small workgroups on mobile // TODO: investigate whether we need to clamp against 256u instead of 128u on mobile const auto scan_workgroup_size = core::max(core::roundDownToPoT(limits.maxWorkgroupSize[0]) >> 1u, 128u); m_scanner = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device), scan_workgroup_size); +#endif } - ~IUtilities() + inline ~IUtilities() { - m_device->unmapMemory(m_defaultDownloadBuffer->getBuffer()->getBoundMemory()); - m_device->unmapMemory(m_defaultUploadBuffer->getBuffer()->getBoundMemory()); } //! @@ -162,6 +158,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return m_defaultDownloadBuffer.get(); } +#if 0 // TODO: port //! virtual CPropertyPoolHandler* getDefaultPropertyPoolHandler() const { @@ -173,7 +170,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted { return m_scanner.get(); } - +#endif //! 
This function provides some guards against streamingBuffer fragmentation or allocation failure static uint32_t getAllocationSizeForStreamingBuffer(const size_t size, const uint64_t alignment, uint32_t maxFreeBlock, const uint32_t optimalTransferAtom) { @@ -198,6 +195,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return allocationSize; } +#if 0 // TODO: port //! WARNING: This function blocks the CPU and stalls the GPU! inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(IQueue* queue, IGPUBuffer::SCreationParams&& params, const void* data) { @@ -396,6 +394,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted // pipelineBarrierAutoSubmit? +#endif // -------------- // downloadBufferRangeViaStagingBuffer @@ -406,9 +405,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted struct default_data_consumption_callback_t { - default_data_consumption_callback_t(void* dstPtr) : - m_dstPtr(dstPtr) - {} + default_data_consumption_callback_t(void* dstPtr) : m_dstPtr(dstPtr) {} inline void operator()(const size_t dstOffset, const void* srcPtr, const size_t size) { @@ -444,8 +441,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted if (m_downstreamingBuffer->needsManualFlushOrInvalidate()) { const auto nonCoherentAtomSize = device->getPhysicalDevice()->getLimits().nonCoherentAtomSize; - auto flushRange = AlignedMappedMemoryRange(m_downstreamingBuffer->getBuffer()->getBoundMemory(), m_copyRange.offset, m_copyRange.length, nonCoherentAtomSize); - device->invalidateMappedMemoryRanges(1u, &flushRange); + auto flushRange = AlignedMappedMemoryRange(m_downstreamingBuffer->getBuffer()->getBoundMemory().memory,m_copyRange.offset,m_copyRange.length,nonCoherentAtomSize); + device->invalidateMappedMemoryRanges(1u,&flushRange); } // Call the function const uint8_t* copySrc = reinterpret_cast(m_downstreamingBuffer->getBufferPointer()) + m_copyRange.offset; @@ -459,7 +456,7 @@ class NBL_API2 IUtilities : public 
core::IReferenceCounted StreamingTransientDataBufferMT<>* m_downstreamingBuffer; const size_t m_dstOffset; }; - +#if 0 // TODO: port //! Calls the callback to copy the data to a destination Offset //! * IMPORTANT: To make the copies ready, IUtility::getDefaultDownStreamingBuffer()->cull_frees() should be called after the `submissionFence` is signaled. //! If the allocation from staging memory fails due to large image size or fragmentation then This function may need to submit the command buffer via the `submissionQueue` and then signal the fence. @@ -742,20 +739,21 @@ class NBL_API2 IUtilities : public core::IReferenceCounted asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, IGPUImage::LAYOUT currentDstImageLayout, const core::SRange& regions, IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo = {} ); +#endif - protected: - + protected: // The application must round down the start of the range to the nearest multiple of VkPhysicalDeviceLimits::nonCoherentAtomSize, // and round the end of the range up to the nearest multiple of VkPhysicalDeviceLimits::nonCoherentAtomSize. - static IDeviceMemoryAllocation::MappedMemoryRange AlignedMappedMemoryRange(IDeviceMemoryAllocation* mem, const size_t& off, const size_t& len, size_t nonCoherentAtomSize) + static ILogicalDevice::MappedMemoryRange AlignedMappedMemoryRange(IDeviceMemoryAllocation* mem, const size_t& off, const size_t& len, size_t nonCoherentAtomSize) { - IDeviceMemoryAllocation::MappedMemoryRange range = {}; + ILogicalDevice::MappedMemoryRange range = {}; range.memory = mem; range.offset = core::alignDown(off, nonCoherentAtomSize); range.length = core::min(core::alignUp(len, nonCoherentAtomSize), mem->getAllocationSize()); return range; } +#if 0 // TODO: port //! Internal tool used to patch command buffers in submit info. 
class CSubmitInfoPatcher { @@ -820,16 +818,18 @@ class NBL_API2 IUtilities : public core::IReferenceCounted core::vector m_allCommandBuffers; core::smart_refctd_ptr m_newCommandBuffer; // if necessary, then need to hold reference to. }; - +#endif core::smart_refctd_ptr m_device; core::smart_refctd_ptr > m_defaultDownloadBuffer; core::smart_refctd_ptr > m_defaultUploadBuffer; +#if 0 // TODO: port core::smart_refctd_ptr m_propertyPoolHandler; core::smart_refctd_ptr m_scanner; -}; #endif +}; + class ImageRegionIterator { public: From 04689b9924898146250254883b89a272105e8ad2 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Tue, 5 Dec 2023 23:31:52 +0300 Subject: [PATCH 30/62] device cap traits --- .../hlsl/device_capabilities_traits.hlsl | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/device_capabilities_traits.hlsl b/include/nbl/builtin/hlsl/device_capabilities_traits.hlsl index b56fc9a557..e7263fd062 100644 --- a/include/nbl/builtin/hlsl/device_capabilities_traits.hlsl +++ b/include/nbl/builtin/hlsl/device_capabilities_traits.hlsl @@ -7,15 +7,44 @@ #include #ifdef __HLSL_VERSION + +NBL_GENERATE_MEMBER_TESTER(shaderFloat64); +NBL_GENERATE_MEMBER_TESTER(shaderDrawParameters); +NBL_GENERATE_MEMBER_TESTER(subgroupArithmetic); +NBL_GENERATE_MEMBER_TESTER(fragmentShaderPixelInterlock); +NBL_GENERATE_MEMBER_TESTER(maxOptimallyResidentWorkgroupInvocations); + +#define NBL_GENERATE_GET_OR_DEFAULT(field, ty, default) \ +template::value> struct get_or_default_##field : integral_constant {}; \ +template struct get_or_default_##field : integral_constant {}; + namespace nbl { namespace hlsl { + +namespace impl +{ +NBL_GENERATE_GET_OR_DEFAULT(shaderFloat64, bool, false); +NBL_GENERATE_GET_OR_DEFAULT(shaderDrawParameters, bool, false); +NBL_GENERATE_GET_OR_DEFAULT(subgroupArithmetic, bool, false); +NBL_GENERATE_GET_OR_DEFAULT(fragmentShaderPixelInterlock, bool, false); 
+NBL_GENERATE_GET_OR_DEFAULT(maxOptimallyResidentWorkgroupInvocations, uint16_t, 0); +} + + template struct device_capabilities_traits { - // TODO: check for members and default them to sane things, only do the 5 members in CJITIncludeLoader.cpp struct, we'll do the rest on `vulkan_1_3` branch with Nahim + NBL_CONSTEXPR_STATIC_INLINE bool shaderFloat64 = impl::get_or_default_shaderFloat64::value; + NBL_CONSTEXPR_STATIC_INLINE bool shaderDrawParameters = impl::get_or_default_shaderDrawParameters::value; + NBL_CONSTEXPR_STATIC_INLINE bool subgroupArithmetic = impl::get_or_default_subgroupArithmetic::value; + NBL_CONSTEXPR_STATIC_INLINE bool fragmentShaderPixelInterlock = impl::get_or_default_fragmentShaderPixelInterlock::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t maxOptimallyResidentWorkgroupInvocations = impl::get_or_default_maxOptimallyResidentWorkgroupInvocations::value; }; + +#undef NBL_GENERATE_GET_OR_DEFAULT + } } #endif From 4a17eafbd5e52143fdfd7ac7aebc00dca55e9165 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Wed, 6 Dec 2023 01:09:00 +0300 Subject: [PATCH 31/62] port macros to boost pp --- .../nbl/builtin/hlsl/member_test_macros.hlsl | 72 ++++++------------- 1 file changed, 20 insertions(+), 52 deletions(-) diff --git a/include/nbl/builtin/hlsl/member_test_macros.hlsl b/include/nbl/builtin/hlsl/member_test_macros.hlsl index 021be424ce..f9c46cc98e 100644 --- a/include/nbl/builtin/hlsl/member_test_macros.hlsl +++ b/include/nbl/builtin/hlsl/member_test_macros.hlsl @@ -5,6 +5,7 @@ #define _NBL_BUILTIN_HLSL_MEMBER_TEST_MACROS_INCLUDED_ #include +#include #ifdef __HLSL_VERSION @@ -74,60 +75,31 @@ NBL_GENERATE_MEMBER_TESTER(z) NBL_GENERATE_MEMBER_TESTER(w) -// Even though it should work for some reason tests fail -// proof it works : https://godbolt.org/z/EzPWGnTPb +#define NBL_REPEAT(fn, n) BOOST_PP_REPEAT(n, fn, n) -#define CAT(x, y) x##y -#define TYPE_DECLARE(n) typename Arg##n -#define TYPE_DECLARE_DEFAULT(n) TYPE_DECLARE(n)=void -#define TYPE_FWD(n) Arg##n 
-#define DECLVAL_DECLARE(n) impl::declval() +#define NBL_TYPE_DECLARE(z, n, x) BOOST_PP_COMMA_IF(x) typename Arg##n +#define NBL_TYPE_DECLARE_DEFAULT(z, n, x) BOOST_PP_COMMA_IF(x) typename Arg##n=void +#define NBL_TYPE_FWD(z, n, x) BOOST_PP_COMMA_IF(x) Arg##n +#define NBL_DECLVAL_DECLARE(z, n, x) impl::declval() BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(BOOST_PP_INC(n), x)) -#define FOR_EACH0(fn) -#define FOR_EACH1(fn) fn(1) -#define FOR_EACH2(fn) fn(2), FOR_EACH1(fn) -#define FOR_EACH3(fn) fn(3), FOR_EACH2(fn) -#define FOR_EACH4(fn) fn(4), FOR_EACH3(fn) -#define FOR_EACH(fn, n) CAT(FOR_EACH, n)(fn) - -#define GENERATE_STATIC_METHOD_TESTER_SPEC0(x) \ -template \ -struct has_static_method_##x::type> : true_type \ -{ \ - using return_type = decltype(T::x()); \ - NBL_CONSTEXPR_STATIC_INLINE uint arg_count = 0; \ -}; - -#define GENERATE_STATIC_METHOD_TESTER_SPEC(x, n) \ -template \ -struct has_static_method_##x::type> : true_type \ +#define GENERATE_STATIC_METHOD_TESTER_SPEC(z, n, x) \ +template \ +struct has_static_method_##x::type> : true_type \ { \ - using return_type = decltype(T::x(FOR_EACH(DECLVAL_DECLARE, n))); \ + using return_type = decltype(T::x(NBL_REPEAT(NBL_DECLVAL_DECLARE, n))); \ NBL_CONSTEXPR_STATIC_INLINE uint arg_count = n; \ }; -#define GENERATE_STATIC_METHOD_TESTER(x) \ -template \ +#define GENERATE_STATIC_METHOD_TESTER(x, n) \ +template \ struct has_static_method_##x : false_type {}; \ -GENERATE_STATIC_METHOD_TESTER_SPEC0(x) \ -GENERATE_STATIC_METHOD_TESTER_SPEC(x, 1) \ -GENERATE_STATIC_METHOD_TESTER_SPEC(x, 2) \ -GENERATE_STATIC_METHOD_TESTER_SPEC(x, 3) \ -GENERATE_STATIC_METHOD_TESTER_SPEC(x, 4) - -#define GENERATE_METHOD_TESTER_SPEC0(x) \ -template \ -struct has_method_##x().x())>::type> : impl::if_2_else_1::value> \ -{ \ - using return_type = decltype(impl::declval().x()); \ - NBL_CONSTEXPR_STATIC_INLINE uint arg_count = 0; \ -}; +BOOST_PP_REPEAT(n, GENERATE_STATIC_METHOD_TESTER_SPEC, x) -#define GENERATE_METHOD_TESTER_SPEC(x, n) \ -template \ 
-struct has_method_##x().x(FOR_EACH(DECLVAL_DECLARE, n)))>::type> : impl::if_2_else_1::value> \ +#define GENERATE_METHOD_TESTER_SPEC(z, n, x) \ +template \ +struct has_method_##x().x(NBL_REPEAT(NBL_DECLVAL_DECLARE, n)))>::type> : impl::if_2_else_1::value> \ { \ - using return_type = decltype(impl::declval().x(FOR_EACH(DECLVAL_DECLARE, n))); \ + using return_type = decltype(impl::declval().x(NBL_REPEAT(NBL_DECLVAL_DECLARE, n))); \ NBL_CONSTEXPR_STATIC_INLINE uint arg_count = n; \ }; @@ -147,14 +119,10 @@ struct has_method_##x \ +namespace impl { GENERATE_STATIC_METHOD_TESTER(x, 4) } \ +template \ struct has_method_##x : false_type {}; \ -GENERATE_METHOD_TESTER_SPEC0(x) \ -GENERATE_METHOD_TESTER_SPEC(x, 1) \ -GENERATE_METHOD_TESTER_SPEC(x, 2) \ -GENERATE_METHOD_TESTER_SPEC(x, 3) \ -GENERATE_METHOD_TESTER_SPEC(x, 4) \ +BOOST_PP_REPEAT(4, GENERATE_METHOD_TESTER_SPEC, x) \ }} From 5fcad02dc0e29074f9eeb652e365299a3fc7fe2e Mon Sep 17 00:00:00 2001 From: atkurtul Date: Wed, 6 Dec 2023 02:10:47 +0300 Subject: [PATCH 32/62] has_member_x_with_type --- include/nbl/builtin/hlsl/member_test_macros.hlsl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/member_test_macros.hlsl b/include/nbl/builtin/hlsl/member_test_macros.hlsl index f9c46cc98e..cbceb00c9e 100644 --- a/include/nbl/builtin/hlsl/member_test_macros.hlsl +++ b/include/nbl/builtin/hlsl/member_test_macros.hlsl @@ -59,12 +59,13 @@ struct is_static_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_con template \ struct is_static_member_##a::value,void>::type>: is_const_helper {}; \ template \ -struct is_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_constant = false;}; \ +struct is_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_constant = false; using type = void; }; \ template \ struct is_member_##a().a),void>::value,void>::type> : is_const_helper().a), true>{}; \ } \ template \ struct has_member_##a { NBL_CONSTEXPR_STATIC_INLINE 
e_member_presence value = (e_member_presence)(impl::is_member_##a::value + impl::is_static_member_##a::value + impl::is_static_member_##a::is_constant); }; \ +template struct has_member_##a##_with_type : bool_constant::value && is_same::type, F>::value> {}; \ } \ } From 3c97ef16f49d821154f10199add05da58e106519 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Wed, 6 Dec 2023 02:40:57 +0300 Subject: [PATCH 33/62] make e_member_presence bitflags --- .../nbl/builtin/hlsl/member_test_macros.hlsl | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/include/nbl/builtin/hlsl/member_test_macros.hlsl b/include/nbl/builtin/hlsl/member_test_macros.hlsl index cbceb00c9e..36384f46dc 100644 --- a/include/nbl/builtin/hlsl/member_test_macros.hlsl +++ b/include/nbl/builtin/hlsl/member_test_macros.hlsl @@ -17,19 +17,11 @@ namespace hlsl namespace impl { -template -struct is_const_helper : bool_constant -{ - using type = T; - NBL_CONSTEXPR_STATIC_INLINE bool is_constant = is_const::value; -}; - enum e_member_presence { - absent = 0, - non_static = 1, - as_static = 2, - static_constexpr = 3, + is_present = 1<<0, + is_static = 1<<1, + is_const = 1<<2, }; template @@ -55,16 +47,16 @@ namespace hlsl \ { \ namespace impl { \ template \ -struct is_static_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_constant = false; }; \ +struct is_static_member_##a: false_type { }; \ template \ -struct is_static_member_##a::value,void>::type>: is_const_helper {}; \ +struct is_static_member_##a::value,void>::type> : true_type { }; \ template \ -struct is_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_constant = false; using type = void; }; \ +struct is_member_##a: false_type { using type = void; }; \ template \ -struct is_member_##a().a),void>::value,void>::type> : is_const_helper().a), true>{}; \ +struct is_member_##a().a),void>::value,void>::type> : true_type { using type = decltype(declval().a); }; \ } \ template \ -struct has_member_##a { 
NBL_CONSTEXPR_STATIC_INLINE e_member_presence value = (e_member_presence)(impl::is_member_##a::value + impl::is_static_member_##a::value + impl::is_static_member_##a::is_constant); }; \ +struct has_member_##a { NBL_CONSTEXPR_STATIC_INLINE e_member_presence value = (e_member_presence)(impl::is_member_##a::value + 2*impl::is_static_member_##a::value + 4*is_const::type>::value); }; \ template struct has_member_##a##_with_type : bool_constant::value && is_same::type, F>::value> {}; \ } \ } From 06b43afe35054b35078795c67f372b4f6f60c36e Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 10 Jan 2024 15:43:36 +0100 Subject: [PATCH 34/62] Use new inline SPIR-V builtin syntax from DXC --- .../nbl/builtin/hlsl/glsl_compat/core.hlsl | 12 +++--- .../hlsl/glsl_compat/subgroup_ballot.hlsl | 38 +++---------------- .../hlsl/glsl_compat/subgroup_basic.hlsl | 23 +++-------- include/nbl/builtin/hlsl/macros.h | 2 +- .../builtin/hlsl/spirv_intrinsics/core.hlsl | 16 ++++++++ .../spirv_intrinsics/subgroup_ballot.hlsl | 11 ++++++ .../hlsl/spirv_intrinsics/subgroup_basic.hlsl | 9 ++++- 7 files changed, 55 insertions(+), 56 deletions(-) diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 3b485ecdd7..92691fdb24 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -60,12 +60,14 @@ T atomicCompSwap(NBL_REF_ARG(T) ptr, T comparator, T value) * For Compute Shaders */ -// TODO (Future): Its annoying we have to forward declare those, but accessing gl_NumSubgroups and other gl_* values is not yet possible due to https://github.com/microsoft/DirectXShaderCompiler/issues/4217 -// also https://github.com/microsoft/DirectXShaderCompiler/issues/5280 -uint32_t gl_LocalInvocationIndex(); +// TODO: Extemely annoying that HLSL doesn't have referencies, so we can't transparently alias the variables as `const&` :( +uint32_t3 gl_NumWorkGroups() {return spirv::NumWorkGroups;} +// TODO: DXC 
BUG prevents us from defining this! uint32_t3 gl_WorkGroupSize(); -uint32_t3 gl_GlobalInvocationID(); -uint32_t3 gl_WorkGroupID(); +uint32_t3 gl_WorkGroupID() {return spirv::WorkgroupId;} +uint32_t3 gl_LocalInvocationID() {return spirv::LocalInvocationId;} +uint32_t3 gl_GlobalInvocationID() {return spirv::GlobalInvocationId;} +uint32_t gl_LocalInvocationIndex() {return spirv::LocalInvocationIndex;} void barrier() { spirv::controlBarrier(spv::ScopeWorkgroup, spv::ScopeWorkgroup, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask); diff --git a/include/nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl b/include/nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl index 528b523d9a..ecd888ae2c 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl @@ -14,38 +14,12 @@ namespace hlsl namespace glsl { -uint32_t4 gl_SubgroupEqMask() -{ - const uint32_t comp = gl_SubgroupInvocationID()>>5; - uint32_t4 retval = uint32_t4(0,0,0,0); - retval[comp] = 0x1u<<(gl_SubgroupInvocationID()&31u); - return retval; -} - -uint32_t4 gl_SubgroupGeMask() -{ - const uint32_t FullBits = 0xffffffffu; - const uint32_t comp = gl_SubgroupInvocationID()>>5; - uint32_t4 retval = uint32_t4(comp>0 ? 0u:FullBits,comp>1 ? 0u:FullBits,comp>2 ? 
0u:FullBits,0u); - retval[comp] = FullBits<<(gl_SubgroupInvocationID()&31u); - return retval; -} - -uint32_t4 gl_SubgroupGtMask() -{ - uint32_t4 retval = gl_SubgroupGeMask(); - const uint32_t comp = gl_SubgroupInvocationID()>>5; - retval[comp] = 0xfffffffeu<<(gl_SubgroupInvocationID()&31u); - return retval; -} - -uint32_t4 gl_SubgroupLeMask() { - return ~gl_SubgroupGtMask(); -} - -uint32_t4 gl_SubgroupLtMask() { - return ~gl_SubgroupGeMask(); -} +// TODO: Extemely annoying that HLSL doesn't have referencies, so we can't transparently alias the variables as `const&` :( +uint32_t4 gl_SubgroupEqMask() {return spirv::BuiltInSubgroupEqMask;} +uint32_t4 gl_SubgroupGeMask() {return spirv::BuiltInSubgroupGeMask;} +uint32_t4 gl_SubgroupGtMask() {return spirv::BuiltInSubgroupGtMask;} +uint32_t4 gl_SubgroupLeMask() {return spirv::BuiltInSubgroupLeMask;} +uint32_t4 gl_SubgroupLtMask() {return spirv::BuiltInSubgroupLtMask;} template T subgroupBroadcastFirst(T value) diff --git a/include/nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl b/include/nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl index c7feacef6f..b7dc990aa4 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl @@ -13,25 +13,15 @@ namespace hlsl { namespace glsl { - #ifdef __HLSL_VERSION -uint32_t gl_SubgroupSize() { - return WaveGetLaneCount(); -} - -uint32_t gl_SubgroupSizeLog2() { - return firstbithigh(gl_SubgroupSize()); -} - -uint32_t gl_SubgroupInvocationID() { - return WaveGetLaneIndex(); -} +// TODO: Extemely annoying that HLSL doesn't have referencies, so we can't transparently alias the variables as `const&` :( +uint32_t gl_SubgroupSize() {return spirv::SubgroupSize;} +uint32_t gl_SubgroupSizeLog2() {return firstbithigh(spirv::SubgroupSize);} +uint32_t gl_SubgroupInvocationID() {return spirv::SubgroupLocalInvocationId;} // only available in compute -uint32_t gl_SubgroupID() { - // TODO (PentaKon): This is not always correct 
(subgroup IDs aren't always aligned with invocation index per the spec) - return gl_LocalInvocationIndex() >> gl_SubgroupSizeLog2(); -} +uint32_t gl_NumSubgroups() {return spirv::NumSubgroups;} +uint32_t gl_SubgroupID() {return spirv::SubgroupId;} bool subgroupElect() { return spirv::subgroupElect(spv::ScopeSubgroup); @@ -57,7 +47,6 @@ void subgroupMemoryBarrierImage() { spirv::memoryBarrier(spv::ScopeSubgroup, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsImageMemoryMask); } #endif - } } } diff --git a/include/nbl/builtin/hlsl/macros.h b/include/nbl/builtin/hlsl/macros.h index b48f90eb2f..c9f08738cb 100644 --- a/include/nbl/builtin/hlsl/macros.h +++ b/include/nbl/builtin/hlsl/macros.h @@ -29,7 +29,7 @@ #define NBL_ARG_125(a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20,a21,a22,a23,a24,a25,a26,a27,a28,a29,a30,a31,a32,a33,a34,a35,a36,a37,a38,a39,a40,a41,a42,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52,a53,a54,a55,a56,a57,a58,a59,a60,a61,a62,a63,a64,a65,a66,a67,a68,a69,a70,a71,a72,a73,a74,a75,a76,a77,a78,a79,a80,a81,a82,a83,a84,a85,a86,a87,a88,a89,a90,a91,a92,a93,a94,a95,a96,a97,a98,a99,a100,a101,a102,a103,a104,a105,a106,a107,a108,a109,a110,a111,a112,a113,a114,a115,a116,a117,a118,a119,a120,a121,a122,a123,a124,a125, ... ) a125 #define NBL_VA_ARGS_COUNT( ... ) NBL_EVAL(NBL_ARG_125(__VA_ARGS__,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) -// +// TODO: Use BOOST_PP! #define NBL_FOREACH_0(WHAT) #define NBL_FOREACH_1(WHAT, X) NBL_EVAL(WHAT(X)) #define NBL_FOREACH_2(WHAT, X, ...) 
NBL_EVAL(WHAT(X)NBL_FOREACH_1(WHAT, __VA_ARGS__)) diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index e202118e8b..1380355669 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -18,6 +18,22 @@ namespace hlsl #ifdef __HLSL_VERSION namespace spirv { +[[vk::ext_builtin_input(spv::BuiltInHelperInvocation)]] +static const bool HelperInvocation; + +[[vk::ext_builtin_input(spv::BuiltInNumWorkgroups)]] +static const uint32_t3 NumWorkGroups; +// TODO: Doesn't work, find out why and file issue on DXC! +//[[vk::ext_builtin_input(spv::BuiltInWorkgroupSize)]] +//static const uint32_t3 WorkgroupSize; +[[vk::ext_builtin_input(spv::BuiltInWorkgroupId)]] +static const uint32_t3 WorkgroupId; +[[vk::ext_builtin_input(spv::BuiltInLocalInvocationId)]] +static const uint32_t3 LocalInvocationId; +[[vk::ext_builtin_input(spv::BuiltInGlobalInvocationId)]] +static const uint32_t3 GlobalInvocationId; +[[vk::ext_builtin_input(spv::BuiltInLocalInvocationIndex)]] +static const uint32_t LocalInvocationIndex; template T atomicAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_ballot.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_ballot.hlsl index cd25c18af7..64c696d3f9 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_ballot.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_ballot.hlsl @@ -15,6 +15,17 @@ namespace hlsl { namespace spirv { +[[vk::ext_builtin_input(spv::BuiltInSubgroupEqMask)]] +static const uint32_t4 BuiltInSubgroupEqMask; +[[vk::ext_builtin_input(spv::BuiltInSubgroupGeMask)]] +static const uint32_t4 BuiltInSubgroupGeMask; +[[vk::ext_builtin_input(spv::BuiltInSubgroupGtMask)]] +static const uint32_t4 BuiltInSubgroupGtMask; +[[vk::ext_builtin_input(spv::BuiltInSubgroupLeMask)]] +static const 
uint32_t4 BuiltInSubgroupLeMask; +[[vk::ext_builtin_input(spv::BuiltInSubgroupLtMask)]] +static const uint32_t4 BuiltInSubgroupLtMask; + template [[vk::ext_capability( spv::CapabilityGroupNonUniformBallot )]] [[vk::ext_instruction( spv::OpGroupNonUniformBroadcastFirst )]] diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_basic.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_basic.hlsl index 0149f4737b..08d493b87a 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_basic.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_basic.hlsl @@ -14,10 +14,17 @@ namespace hlsl { namespace spirv { +[[vk::ext_builtin_input(spv::BuiltInSubgroupSize)]] +static const uint32_t SubgroupSize; +[[vk::ext_builtin_input(spv::BuiltInNumSubgroups)]] +static const uint32_t NumSubgroups; +[[vk::ext_builtin_input(spv::BuiltInSubgroupId)]] +static const uint32_t SubgroupId; +[[vk::ext_builtin_input(spv::BuiltInSubgroupLocalInvocationId)]] +static const uint32_t SubgroupLocalInvocationId; [[vk::ext_instruction( spv::OpGroupNonUniformElect )]] bool subgroupElect(uint32_t executionScope); - } } } From fd73e2802f1f667e1bf527127564de89e31e8488 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 10:53:45 +0100 Subject: [PATCH 35/62] const correctness on surface capabilities --- include/nbl/video/utilities/SPhysicalDeviceFilter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/utilities/SPhysicalDeviceFilter.h b/include/nbl/video/utilities/SPhysicalDeviceFilter.h index ff3b9b743c..ef5d15661d 100644 --- a/include/nbl/video/utilities/SPhysicalDeviceFilter.h +++ b/include/nbl/video/utilities/SPhysicalDeviceFilter.h @@ -56,11 +56,11 @@ namespace nbl::video // See vkGetPhysicalDeviceSurfaceSupportKHR struct SurfaceCompatibility { - ISurface* surface = nullptr; + const ISurface* surface = nullptr; // Setting this to `EQF_NONE` means it sufffices to find any queue family that can present to this surface, 
regardless of flags it might have core::bitflag presentationQueueFlags = IQueue::FAMILY_FLAGS::NONE; }; - SurfaceCompatibility* requiredSurfaceCompatibilities = nullptr; + const SurfaceCompatibility* requiredSurfaceCompatibilities = nullptr; uint32_t requiredSurfaceCompatibilitiesCount = 0u; From 153dd21fa03d0a612a990cc27534d81839a9019c Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 12:25:50 +0100 Subject: [PATCH 36/62] 3D Blit test case was failing because of unimplemented functions for the R11G11B10_UFLOAT format, but there are still errors --- include/nbl/asset/format/EFormat.h | 91 ++++++++++++++----------- include/nbl/asset/format/encodePixels.h | 1 + 2 files changed, 53 insertions(+), 39 deletions(-) diff --git a/include/nbl/asset/format/EFormat.h b/include/nbl/asset/format/EFormat.h index 1ca7b97dd4..bd91db1d17 100644 --- a/include/nbl/asset/format/EFormat.h +++ b/include/nbl/asset/format/EFormat.h @@ -1762,18 +1762,28 @@ inline value_type getFormatMaxValue(E_FORMAT format, uint32_t channel) { switch (format) { - case EF_BC6H_SFLOAT_BLOCK: return 32767; - case EF_BC6H_UFLOAT_BLOCK: return 65504; - default: break; + case EF_B10G11R11_UFLOAT_PACK32: + if (channel<=1) + return 65520; + else if (channel==2) + return 65504; + break; + case EF_E5B9G9R9_UFLOAT_PACK32: + if (channel<3) + return 32704; + break; + case EF_BC6H_SFLOAT_BLOCK: return 32767; + case EF_BC6H_UFLOAT_BLOCK: return 65504; + default: break; } auto bytesPerChannel = (getBytesPerPixel(format)*core::rational(1,getFormatChannelCount(format))).getIntegerApprox(); switch (bytesPerChannel) { - case 2u: return 65504; - case 4u: return FLT_MAX; - case 8u: return DBL_MAX; - default: break; + case 2u: return 65504; + case 4u: return FLT_MAX; + case 8u: return DBL_MAX; + default: break; } } return 0; @@ -1882,44 +1892,47 @@ inline value_type getFormatPrecision(E_FORMAT format, uint32_t channel, value_ty else if (isFloatingPointFormat(format)) { switch (format) - { - case 
EF_B10G11R11_UFLOAT_PACK32: { // unsigned values are always ordered as + 1 - float f = std::abs(static_cast(value)); - int bitshft = channel == 2u ? 6 : 5; - - uint16_t f16 = core::Float16Compressor::compress(f); - uint16_t enc = f16 >> bitshft; - uint16_t next_f16 = (enc + 1) << bitshft; - - return core::Float16Compressor::decompress(next_f16) - f; - } - case EF_E5B9G9R9_UFLOAT_PACK32: - return 0; //TODO - default: break; + case EF_B10G11R11_UFLOAT_PACK32: [[fallthrough]]; + case EF_E5B9G9R9_UFLOAT_PACK32: // TODO: probably need to change signature and take all values? + { + float f = std::abs(static_cast(value)); + int bitshift; + if (format==EF_B10G11R11_UFLOAT_PACK32) + bitshift = channel==2u ? 6:5; + else + bitshift = 4; + + uint16_t f16 = core::Float16Compressor::compress(f); + uint16_t enc = f16 >> bitshift; + uint16_t next_f16 = (enc + 1) << bitshift; + + return core::Float16Compressor::decompress(next_f16) - f; + } + default: break; } auto bytesPerChannel = (getBytesPerPixel(format)*core::rational(1,getFormatChannelCount(format))).getIntegerApprox(); switch (bytesPerChannel) { - case 2u: - { - float f = std::abs(static_cast(value)); - uint16_t f16 = core::Float16Compressor::compress(f); - uint16_t dir = core::Float16Compressor::compress(2.f*(f+1.f)); - return core::Float16Compressor::decompress( core::nextafter16(f16, dir) ) - f; - } - case 4u: - { - float f32 = std::abs(static_cast(value)); - return core::nextafter32(f32,2.f*(f32+1.f))-f32; - } - case 8u: - { - double f64 = std::abs(static_cast(value)); - return core::nextafter64(f64,2.0*(f64+1.0))-f64; - } - default: break; + case 2u: + { + float f = std::abs(static_cast(value)); + uint16_t f16 = core::Float16Compressor::compress(f); + uint16_t dir = core::Float16Compressor::compress(2.f*(f+1.f)); + return core::Float16Compressor::decompress( core::nextafter16(f16, dir) ) - f; + } + case 4u: + { + float f32 = std::abs(static_cast(value)); + return core::nextafter32(f32,2.f*(f32+1.f))-f32; + } + case 8u: 
+ { + double f64 = std::abs(static_cast(value)); + return core::nextafter64(f64,2.0*(f64+1.0))-f64; + } + default: break; } } diff --git a/include/nbl/asset/format/encodePixels.h b/include/nbl/asset/format/encodePixels.h index 2db1c08bcb..293bad884f 100644 --- a/include/nbl/asset/format/encodePixels.h +++ b/include/nbl/asset/format/encodePixels.h @@ -2488,6 +2488,7 @@ namespace asset inp >>= 52; inp &= 0x7ffull; inp -= (1023ull - 15ull); + // TODO: this is wrong, need to get maximum exponent across all 3 input values exp = (static_cast(inp) << 27); } for (uint32_t i = 0u; i < 3u; ++i) From bc7e24de81d850b1ce991516c0494081aecc92c1 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 13:23:14 +0100 Subject: [PATCH 37/62] Make the SPhysicalDeviceFilter use spans for requirement arrays. Adjust working examples accordingly Also correct bad DXC merge --- .../video/utilities/SPhysicalDeviceFilter.h | 61 ++++++++----------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/include/nbl/video/utilities/SPhysicalDeviceFilter.h b/include/nbl/video/utilities/SPhysicalDeviceFilter.h index ef5d15661d..d502be8a79 100644 --- a/include/nbl/video/utilities/SPhysicalDeviceFilter.h +++ b/include/nbl/video/utilities/SPhysicalDeviceFilter.h @@ -23,8 +23,7 @@ namespace nbl::video size_t size = 0ull; core::bitflag memoryFlags = IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; }; - const MemoryRequirement* memoryRequirements = nullptr; - uint32_t memoryRequirementsCount = 0u; + std::span memoryRequirements = {}; struct QueueRequirement { @@ -49,8 +48,7 @@ namespace nbl::video // family's transfer granularity needs to be <= asset::VkExtent3D maxImageTransferGranularity = {0x80000000u,0x80000000u,0x80000000u}; }; - const QueueRequirement* queueRequirements = nullptr; - uint32_t queueRequirementsCount = 0u; + std::span queueRequirements = {}; // To determine whether a queue family of a physical device supports presentation to a given surface // See 
vkGetPhysicalDeviceSurfaceSupportKHR @@ -60,8 +58,7 @@ namespace nbl::video // Setting this to `EQF_NONE` means it sufffices to find any queue family that can present to this surface, regardless of flags it might have core::bitflag presentationQueueFlags = IQueue::FAMILY_FLAGS::NONE; }; - const SurfaceCompatibility* requiredSurfaceCompatibilities = nullptr; - uint32_t requiredSurfaceCompatibilitiesCount = 0u; + std::span requiredSurfaceCompatibilities = {}; // sift through multiple devices @@ -120,28 +117,24 @@ namespace nbl::video return false; // Surface Compatibility - if (requiredSurfaceCompatibilities != nullptr) + for (const auto& requiredSurfaceCompatibility : requiredSurfaceCompatibilities) { - for (uint32_t i = 0u; i < requiredSurfaceCompatibilitiesCount; ++i) - { - const auto& requiredSurfaceCompatibility = requiredSurfaceCompatibilities[i]; - if (requiredSurfaceCompatibility.surface == nullptr) - continue; // we don't care about compatibility with a nullptr surface :) + if (requiredSurfaceCompatibility.surface == nullptr) + continue; // we don't care about compatibility with a nullptr surface :) - const auto& queueFamilyProperties = physicalDevice->getQueueFamilyProperties(); - - bool physicalDeviceSupportsSurfaceWithQueueFlags = false; - for (uint32_t qfam = 0u; qfam < queueFamilyProperties.size(); ++qfam) - { - const auto& familyProperty = queueFamilyProperties[qfam]; - if(familyProperty.queueFlags.hasFlags(requiredSurfaceCompatibility.presentationQueueFlags)) - if(requiredSurfaceCompatibility.surface->isSupportedForPhysicalDevice(physicalDevice, qfam)) - physicalDeviceSupportsSurfaceWithQueueFlags = true; - } - - if(!physicalDeviceSupportsSurfaceWithQueueFlags) - return false; + const auto& queueFamilyProperties = physicalDevice->getQueueFamilyProperties(); + + bool physicalDeviceSupportsSurfaceWithQueueFlags = false; + for (uint32_t qfam = 0u; qfam < queueFamilyProperties.size(); ++qfam) + { + const auto& familyProperty = queueFamilyProperties[qfam]; 
+ if(familyProperty.queueFlags.hasFlags(requiredSurfaceCompatibility.presentationQueueFlags)) + if(requiredSurfaceCompatibility.surface->isSupportedForPhysicalDevice(physicalDevice, qfam)) + physicalDeviceSupportsSurfaceWithQueueFlags = true; } + + if(!physicalDeviceSupportsSurfaceWithQueueFlags) + return false; } // Memory Requirements Checking: @@ -155,25 +148,23 @@ namespace nbl::video } // over-estimation, Not exact // TODO: Exact or Better Logic -> try find a feasible fitting of requirements into heaps. - for (uint32_t m = 0; m < memoryRequirementsCount; ++m) + for (const auto& req : memoryRequirements) { - size_t memSize = memoryRequirements[m].size; - for (uint32_t h = 0; h < memoryProps.memoryHeapCount; ++h) - if (heapFlags[h].hasFlags(memoryRequirements[m].memoryFlags)) - memSize = (memoryProps.memoryHeaps[h].size > memSize) ? 0ull : memSize - memoryProps.memoryHeaps[h].size; - if (memSize > 0) + size_t memSize = req.size; + for (uint32_t h=0; hmemSize ? 0ull:(memSize-memoryProps.memoryHeaps[h].size); + if (memSize>0) return false; } // Queue Requirements Checking: // over-estimation, Not exact // TODO: Exact or Better Logic -> try find a feasible fitting of requirements into queue families. 
- for (uint32_t q = 0; q < queueRequirementsCount; ++q) + for (const auto& queueReqs : queueRequirements) { - const auto& queueReqs = queueRequirements[q]; uint32_t queueCount = queueReqs.queueCount; - - for (uint32_t qfam = 0; qfam < queueProps.size(); ++qfam) + for (uint32_t qfam=0; qfam Date: Fri, 12 Jan 2024 14:48:45 +0100 Subject: [PATCH 38/62] ok so I found out that renderdoc hates External memory --- include/nbl/video/SPhysicalDeviceLimits.h | 2 +- src/nbl/video/CVulkanPhysicalDevice.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/SPhysicalDeviceLimits.h b/include/nbl/video/SPhysicalDeviceLimits.h index 7f58a67443..c3e13f145b 100644 --- a/include/nbl/video/SPhysicalDeviceLimits.h +++ b/include/nbl/video/SPhysicalDeviceLimits.h @@ -332,7 +332,7 @@ struct SPhysicalDeviceLimits /* VK_EXT_external_memory_host */ /* ExternalMemoryHostPropertiesEXT */ - uint32_t minImportedHostPointerAlignment = 0x1u<<16u; + uint32_t minImportedHostPointerAlignment = 0x1u<<31u; /* ShaderAtomicFloatFeaturesEXT *//* VK_EXT_shader_atomic_float */ // [REQUIRE] Nabla Core Profile diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index e457ae3a2b..030890b187 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -257,7 +257,7 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart return availableFeatureSet.find(name)!=availableFeatureSet.end(); }; //! Required by Nabla Core Profile - if (!isExtensionSupported(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME)) + if (!rdoc && !isExtensionSupported(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME)) return nullptr; if (!isExtensionSupported(VK_EXT_SHADER_ATOMIC_FLOAT_EXTENSION_NAME)) return nullptr; @@ -294,7 +294,6 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart addToPNextChain(&vulkan13Properties); //! 
Required by Nabla Core Profile VkPhysicalDeviceExternalMemoryHostPropertiesEXT externalMemoryHostProperties = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT }; - addToPNextChain(&externalMemoryHostProperties); VkPhysicalDeviceRobustness2PropertiesEXT robustness2Properties = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT }; addToPNextChain(&robustness2Properties); //! Extensions (ordered by spec extension number) @@ -314,6 +313,9 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart #endif VkPhysicalDeviceShaderSMBuiltinsPropertiesNV shaderSMBuiltinsPropertiesNV = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SM_BUILTINS_PROPERTIES_NV }; VkPhysicalDeviceShaderCoreProperties2AMD shaderCoreProperties2AMD = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_2_AMD }; + //! Because Renderdoc is special and instead of ignoring extensions it whitelists them + if (isExtensionSupported(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME)) + addToPNextChain(&externalMemoryHostProperties); //! This is only written for convenience to avoid getting validation errors otherwise vulkan will just skip any strutctures it doesn't recognize if (isExtensionSupported(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME)) addToPNextChain(&conservativeRasterizationProperties); @@ -504,7 +506,8 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart //! 
Nabla Core Extensions - properties.limits.minImportedHostPointerAlignment = externalMemoryHostProperties.minImportedHostPointerAlignment; + if (isExtensionSupported(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME)) // renderdoc special + properties.limits.minImportedHostPointerAlignment = externalMemoryHostProperties.minImportedHostPointerAlignment; // there's no ShaderAtomicFloatPropertiesEXT @@ -1398,7 +1401,7 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic extensionsToEnable.insert(VK_KHR_EXTERNAL_FENCE_WIN32_EXTENSION_NAME); // All Requirements Exist in Vulkan 1.1 (including instance extensions) #endif enableExtensionIfAvailable(VK_KHR_EXTERNAL_FENCE_FD_EXTENSION_NAME); - extensionsToEnable.insert(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME); + enableExtensionIfAvailable(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME); extensionsToEnable.insert(VK_EXT_SHADER_ATOMIC_FLOAT_EXTENSION_NAME); //! required but has overhead so conditional From b5a633a6b2d4ba113f739883e50d8dcc2cc74932 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 14:56:17 +0100 Subject: [PATCH 39/62] fix typos causing issues --- include/nbl/asset/IFramebuffer.h | 2 +- src/nbl/video/IGPUCommandBuffer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IFramebuffer.h b/include/nbl/asset/IFramebuffer.h index c2d2f21085..99295e7503 100644 --- a/include/nbl/asset/IFramebuffer.h +++ b/include/nbl/asset/IFramebuffer.h @@ -96,7 +96,7 @@ class IFramebuffer if (!attachments[i]) return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-commonparent - if (rp->isCompatibleDevicewise(attachments[i].get())) + if (!rp->isCompatibleDevicewise(attachments[i].get())) return true; const auto& viewParams = attachments[i]->getCreationParameters(); diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 2ede1f2c0f..0f890ffdb8 100644 --- 
a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -749,7 +749,7 @@ bool IGPUCommandBuffer::pushConstants(const IGPUPipelineLayout* const layout, co if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT)) return false; - if (!layout || this->isCompatibleDevicewise(layout)) + if (!layout || !this->isCompatibleDevicewise(layout)) return false; if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(layout))) From 2ab33eda4cb684172a9e0d891012d0cccda0d3df Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 16:44:30 +0100 Subject: [PATCH 40/62] API draft --- include/nbl/video/utilities/IUtilities.h | 141 +++++++++++++++-------- src/nbl/video/utilities/IUtilities.cpp | 7 ++ 2 files changed, 101 insertions(+), 47 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 89d45da3f7..2a97610d71 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -211,7 +211,75 @@ class NBL_API2 IUtilities : public core::IReferenceCounted updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u, params.size, core::smart_refctd_ptr(buffer)}, data, queue); return buffer; } +#endif + struct SIntendedSubmitInfo final + { + public: + inline bool valid() const + { + if (!queue || commandBuffers.empty() || signalSemaphores.empty()) + return false; + if (!getScratchCommandBuffer()->isResettable()) + return false; + if (!getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return false; + for (const auto& info : commandBuffers) + if (info.cmdbuf->getPool()->getQueueFamilyIndex()!=queue->getFamilyIndex()) + return false; + return true; + } + + // Use the last command buffer in intendedNextSubmit, it should be in recording state + inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.back().cmdbuf;} + inline const 
IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.back().cmdbuf;} + + inline ISemaphore::SWaitInfo getScratchSemaphoreNextWait() const {return {signalSemaphores.front().semaphore,signalSemaphores.front().value};} + + inline operator IQueue::SSubmitInfo() const + { + return { + .waitSemaphores = waitSemaphores, + .commandBuffers = commandBuffers, + .signalSemaphores = signalSemaphores + }; + } + + inline void overflowSubmit() + { + auto cmdbuf = getScratchCommandBuffer(); + auto& scratchSemaphore = signalSemaphores.front(); + // but first sumbit the already buffered up copies + cmdbuf->end(); + IQueue::SSubmitInfo submit = *this; + // we only signal the last semaphore which is used as scratch + submit.signalSemaphores = {&scratchSemaphore,1}; + assert(submit.isValid()); + queue->submit({&submit,1}); + // We wait (stall) on the immediately preceeding submission timeline semaphore signal value and increase it for the next signaller + { + const ISemaphore::SWaitInfo info = {scratchSemaphore.semaphore,scratchSemaphore.value++}; + const_cast(cmdbuf->getOriginDevice())->blockForSemaphores({&info,1}); + } + // we've already waited on the Host for the semaphores, no use waiting twice + waitSemaphores = {}; + // since all the commandbuffers have submitted already we only reuse the last one + commandBuffers = {&commandBuffers.back(),1}; + // we will still signal the same set in the future + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + } + + + IQueue* queue = {}; + std::span waitSemaphores = {}; + std::span commandBuffers = {}; + std::span signalSemaphores = {}; + + private: + friend class IUtilities; + static const char* ErrorText; + }; // -------------- // updateBufferRangeViaStagingBuffer // -------------- @@ -230,7 +298,10 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! ** The last command buffer will be used to record the copy commands //! 
- submissionQueue: IQueue used to submit, when needed. //! Note: This parameter is required but may not be used if there is no need to submit - //! - submissionFence: + //! - scratchSemaphore: + //! - since you've already decided on the semaphores you'll wait and signal in the `intendedNextSubmit`, we need an extra semaphore to "stich together" the submit if we split it + + //! - This is the fence you will use to submit the copies to, this allows freeing up space in stagingBuffer when the fence is signalled, indicating that the copy has finished. //! - This fence will be in `UNSIGNALED` state after exiting the function. (It will reset after each implicit submit) //! - This fence may be used for CommandBuffer submissions using `submissionQueue` inside the function. @@ -249,31 +320,26 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! * submissionFence must point to a valid IGPUFence //! * submissionFence must be in `UNSIGNALED` state //! ** IUtility::getDefaultUpStreamingBuffer()->cull_frees() should be called before reseting the submissionFence and after fence is signaled. 
- [[nodiscard("Use The New IQueue::SubmitInfo")]] inline IQueue::SSubmitInfo updateBufferRangeViaStagingBuffer( - const asset::SBufferRange& bufferRange, const void* data, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo intendedNextSubmit - ) + inline bool updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { - if(!intendedNextSubmit.isValid() || intendedNextSubmit.commandBufferCount <= 0u) + if (!bufferRange.isValid() || !bufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_DST_BIT)) { - // TODO: log error -> intendedNextSubmit is invalid - assert(false); - return intendedNextSubmit; + m_logger.log("Invalid `bufferRange` or buffer has no `EUF_TRANSFER_DST_BIT` usage flag, cannot `updateBufferRangeViaStagingBuffer`!", system::ILogger::ELL_ERROR); + return false; + } + + if (!nextSubmit.valid()) + { + m_logger.log(nextSubmit.ErrorText,system::ILogger::ELL_ERROR); + return false; } const auto& limits = m_device->getPhysicalDevice()->getLimits(); - const uint32_t optimalTransferAtom = limits.maxResidentInvocations*sizeof(uint32_t); - - // Use the last command buffer in intendedNextSubmit, it should be in recording state - auto& cmdbuf = intendedNextSubmit.commandBuffers[intendedNextSubmit.commandBufferCount-1]; - auto* cmdpool = cmdbuf->getPool(); - assert(cmdbuf->isResettable()); - assert(cmdpool->getQueueFamilyIndex() == submissionQueue->getFamilyIndex()); - assert(cmdbuf->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)); - assert(bufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_DST_BIT)); + const uint32_t optimalTransferAtom = limits.maxResidentInvocations * sizeof(uint32_t); + auto cmdbuf = nextSubmit.getScratchCommandBuffer(); // no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal - 
for (size_t uploadedSize = 0ull; uploadedSize < bufferRange.size;) + for (size_t uploadedSize=0ull; uploadedSize::invalid_value) { - // but first sumbit the already buffered up copies - cmdbuf->end(); - IQueue::SSubmitInfo submit = intendedNextSubmit; - submit.signalSemaphoreCount = 0u; - submit.pSignalSemaphores = nullptr; - assert(submit.isValid()); - submissionQueue->submit(1u, &submit, submissionFence); - m_device->blockForFences(1u, &submissionFence); - intendedNextSubmit.commandBufferCount = 1u; - intendedNextSubmit.commandBuffers = &cmdbuf; - intendedNextSubmit.waitSemaphoreCount = 0u; - intendedNextSubmit.pWaitSemaphores = nullptr; - intendedNextSubmit.pWaitDstStageMask = nullptr; - // before resetting we need poll all events in the allocator's deferred free list - m_defaultUploadBuffer->cull_frees(); - // we can reset the fence and commandbuffer because we fully wait for the GPU to finish here - m_device->resetFences(1u, &submissionFence); - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + nextSubmit.overflowSubmit(); continue; } // some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly if (m_defaultUploadBuffer.get()->needsManualFlushOrInvalidate()) { - auto flushRange = AlignedMappedMemoryRange(m_defaultUploadBuffer.get()->getBuffer()->getBoundMemory(),localOffset,subSize,limits.nonCoherentAtomSize); + auto flushRange = AlignedMappedMemoryRange(m_defaultUploadBuffer.get()->getBuffer()->getBoundMemory().memory,localOffset,subSize,limits.nonCoherentAtomSize); m_device->flushMappedMemoryRanges(1u,&flushRange); } // after we make sure writes are in GPU memory (visible to GPU) and not still in a cache, we can copy using the GPU to device-only memory IGPUCommandBuffer::SBufferCopy copy; copy.srcOffset = localOffset; - copy.dstOffset = bufferRange.offset + uploadedSize; + copy.dstOffset = bufferRange.offset+uploadedSize; copy.size 
= subSize; cmdbuf->copyBuffer(m_defaultUploadBuffer.get()->getBuffer(), bufferRange.buffer.get(), 1u, ©); - // this doesn't actually free the memory, the memory is queued up to be freed only after the GPU fence/event is signalled - m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,core::smart_refctd_ptr(submissionFence),&cmdbuf); // can queue with a reset but not yet pending fence, just fine + // this doesn't actually free the memory, the memory is queued up to be freed only after the `scratchSemaphore` reaches a value a future submit will signal + m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getScratchSemaphoreNextWait(),&cmdbuf); uploadedSize += subSize; } - return intendedNextSubmit; + return true; } - +#if 0 //! This function is an specialization of the `updateBufferRangeViaStagingBuffer` function above. //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit //! WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads @@ -373,7 +421,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted } //! This function is an specialization of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. - //! Additionally waits for the fence + //! Additionally waits for the upload right away //! WARNING: This function blocks CPU and stalls the GPU! inline void updateBufferRangeViaStagingBufferAutoSubmit( const asset::SBufferRange& bufferRange, const void* data, @@ -391,10 +439,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted updateBufferRangeViaStagingBufferAutoSubmit(bufferRange, data, submissionQueue, fence.get(), submitInfo); m_device->blockForFences(1u, &fence.get()); } - +#endif // pipelineBarrierAutoSubmit? 
-#endif // -------------- // downloadBufferRangeViaStagingBuffer diff --git a/src/nbl/video/utilities/IUtilities.cpp b/src/nbl/video/utilities/IUtilities.cpp index e46aec48ad..5ad7612f1e 100644 --- a/src/nbl/video/utilities/IUtilities.cpp +++ b/src/nbl/video/utilities/IUtilities.cpp @@ -4,6 +4,13 @@ namespace nbl::video { +const char* IUtilities::SIntendedSubmitInfo::ErrorText = R"===(Invalid `IUtilities::SIntendedSubmitInfo`, possible reasons are: +- No `commandBuffers` or `signalSemaphores` given in respective spans +- `commandBuffer.back()` is not Resettable +- `commandBuffer.back()` is not already begun with ONE_TIME_SUBMIT_BIT +- one of the `commandBuffer`s' Pool's Queue Family Index doesn't match `queue`'s Family +)==="; + #if 0 // TODO: port IQueue::SSubmitInfo IUtilities::updateImageViaStagingBuffer( asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, asset::IImage::LAYOUT currentDstImageLayout, const core::SRange& regions, From bbc5aa994a133ec0b2a8cc8a6ebe1e0b01a58958 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 17:27:10 +0100 Subject: [PATCH 41/62] think about the other 3 utility functions --- include/nbl/video/utilities/IUtilities.h | 131 +++++++++++------------ 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 2a97610d71..bd4c748c7d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -194,60 +194,52 @@ class NBL_API2 IUtilities : public core::IReferenceCounted )); return allocationSize; } - -#if 0 // TODO: port - //! WARNING: This function blocks the CPU and stalls the GPU! 
- inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(IQueue* queue, IGPUBuffer::SCreationParams&& params, const void* data) + + struct SFrontHalfSubmitInfo final { - if(!params.usage.hasFlags(IGPUBuffer::EUF_TRANSFER_DST_BIT)) - { - assert(false); - return nullptr; - } - auto buffer = m_device->createBuffer(std::move(params)); - auto mreqs = buffer->getMemoryReqs(); - mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto mem = m_device->allocate(mreqs, buffer.get()); - updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u, params.size, core::smart_refctd_ptr(buffer)}, data, queue); - return buffer; - } -#endif + inline bool valid() const {return queue;} + // Use the last command buffer in intendedNextSubmit, it should be in recording state + inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + + + IQueue* queue = {}; + std::span waitSemaphores = {}; + std::span commandBuffers = {}; + }; + //! Struct meant to be used with any utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. + //! Such functions are non-blocking (unless overflow) and take `SIntendedSubmitInfo` by reference and patch it accordingly. + //! MAKE SURE to do a submit to `queue` by yourself with a submit info obtained by casting `this` to `IQueue::SSubmitInfo` ! + //! for example: in the case the `frontHalf.waitSemaphores` were already waited upon, the struct will be modified to have it's `waitSemaphores` emptied. 
struct SIntendedSubmitInfo final { public: inline bool valid() const { - if (!queue || commandBuffers.empty() || signalSemaphores.empty()) + if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) return false; - if (!getScratchCommandBuffer()->isResettable()) + if (!frontHalf.getScratchCommandBuffer()->isResettable()) return false; - if (!getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - return false; - for (const auto& info : commandBuffers) - if (info.cmdbuf->getPool()->getQueueFamilyIndex()!=queue->getFamilyIndex()) + if (!frontHalf.getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) return false; return true; } - // Use the last command buffer in intendedNextSubmit, it should be in recording state - inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.back().cmdbuf;} - inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.back().cmdbuf;} - inline ISemaphore::SWaitInfo getScratchSemaphoreNextWait() const {return {signalSemaphores.front().semaphore,signalSemaphores.front().value};} inline operator IQueue::SSubmitInfo() const { return { - .waitSemaphores = waitSemaphores, - .commandBuffers = commandBuffers, + .waitSemaphores = frontHalf.waitSemaphores, + .commandBuffers = frontHalf.commandBuffers, .signalSemaphores = signalSemaphores }; } inline void overflowSubmit() { - auto cmdbuf = getScratchCommandBuffer(); + auto cmdbuf = frontHalf.getScratchCommandBuffer(); auto& scratchSemaphore = signalSemaphores.front(); // but first sumbit the already buffered up copies cmdbuf->end(); @@ -255,31 +247,32 @@ class NBL_API2 IUtilities : public core::IReferenceCounted // we only signal the last semaphore which is used as scratch submit.signalSemaphores = {&scratchSemaphore,1}; assert(submit.isValid()); - queue->submit({&submit,1}); + frontHalf.queue->submit({&submit,1}); // We wait 
(stall) on the immediately preceeding submission timeline semaphore signal value and increase it for the next signaller { const ISemaphore::SWaitInfo info = {scratchSemaphore.semaphore,scratchSemaphore.value++}; const_cast(cmdbuf->getOriginDevice())->blockForSemaphores({&info,1}); } // we've already waited on the Host for the semaphores, no use waiting twice - waitSemaphores = {}; + frontHalf.waitSemaphores = {}; // since all the commandbuffers have submitted already we only reuse the last one - commandBuffers = {&commandBuffers.back(),1}; + frontHalf.commandBuffers = {&frontHalf.commandBuffers.back(),1}; // we will still signal the same set in the future cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); } - IQueue* queue = {}; - std::span waitSemaphores = {}; - std::span commandBuffers = {}; + //! The last CommandBuffer will be used to record the copy commands + SFrontHalfSubmitInfo frontHalf = {}; + //! The first Semaphore will be used as a scratch, so don't use it yourself as we can advance the counter an arbitrary amount! std::span signalSemaphores = {}; private: friend class IUtilities; static const char* ErrorText; }; + // -------------- // updateBufferRangeViaStagingBuffer // -------------- @@ -287,15 +280,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! Copies `data` to stagingBuffer and Records the commands needed to copy the data from stagingBuffer to `bufferRange.buffer` //! If the allocation from staging memory fails due to large buffer size or fragmentation then This function may need to submit the command buffer via the `submissionQueue`. //! Returns: - //! IQueue::SSubmitInfo to use for command buffer submission instead of `intendedNextSubmit`. - //! for example: in the case the `SSubmitInfo::waitSemaphores` were already signalled, the new SSubmitInfo will have it's waitSemaphores emptied from `intendedNextSubmit`. - //! 
Make sure to submit with the new SSubmitInfo returned by this function + //! the number of times we overflown and had to submit, <0 [negative] on failure //! Parameters: + //! - nextSubmit: + //! Is the SubmitInfo you intended to submit your command buffers with, it will be patched if overflow occurred @see SIntendedSubmitInfo //! - bufferRange: contains offset + size into bufferRange::buffer that will be copied from `data` (offset doesn't affect how `data` is accessed) //! - data: raw pointer to data that will be copied to bufferRange::buffer - //! - intendedNextSubmit: - //! Is the SubmitInfo you intended to submit your command buffers. - //! ** The last command buffer will be used to record the copy commands //! - submissionQueue: IQueue used to submit, when needed. //! Note: This parameter is required but may not be used if there is no need to submit //! - scratchSemaphore: @@ -320,24 +310,25 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! * submissionFence must point to a valid IGPUFence //! * submissionFence must be in `UNSIGNALED` state //! ** IUtility::getDefaultUpStreamingBuffer()->cull_frees() should be called before reseting the submissionFence and after fence is signaled. 
- inline bool updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) + inline int64_t updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { if (!bufferRange.isValid() || !bufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_DST_BIT)) { m_logger.log("Invalid `bufferRange` or buffer has no `EUF_TRANSFER_DST_BIT` usage flag, cannot `updateBufferRangeViaStagingBuffer`!", system::ILogger::ELL_ERROR); - return false; + return -1; } if (!nextSubmit.valid()) { m_logger.log(nextSubmit.ErrorText,system::ILogger::ELL_ERROR); - return false; + return -1; } const auto& limits = m_device->getPhysicalDevice()->getLimits(); const uint32_t optimalTransferAtom = limits.maxResidentInvocations * sizeof(uint32_t); - auto cmdbuf = nextSubmit.getScratchCommandBuffer(); + auto cmdbuf = nextSubmit.frontHalf.getScratchCommandBuffer(); + int64_t overflowCounter = 0; // no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal for (size_t uploadedSize=0ull; uploadedSize::invalid_value) { nextSubmit.overflowSubmit(); + overflowCounter++; continue; } // some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly @@ -380,9 +372,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getScratchSemaphoreNextWait(),&cmdbuf); uploadedSize += subSize; } - return true; + return overflowCounter; } -#if 0 + //! This function is an specialization of the `updateBufferRangeViaStagingBuffer` function above. //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit //! 
WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads @@ -399,16 +391,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! Valid Usage: //! * If submitInfo::commandBufferCount > 0 and the last command buffer must be in one of these stages: `EXECUTABLE`, `INITIAL`, `RECORDING` //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. - inline void updateBufferRangeViaStagingBufferAutoSubmit( - const asset::SBufferRange& bufferRange, const void* data, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo submitInfo = {} - ) + inline bool updateBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { - if(!submitInfo.isValid()) + if(!nextSubmit.frontHalf.valid()) { // TODO: log error - assert(false); - return; + return false; } CSubmitInfoPatcher submitInfoPatcher; @@ -418,28 +406,39 @@ class NBL_API2 IUtilities : public core::IReferenceCounted assert(submitInfo.isValid()); submissionQueue->submit(1u,&submitInfo,submissionFence); + return true; } //! This function is an specialization of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. //! Additionally waits for the upload right away //! WARNING: This function blocks CPU and stalls the GPU! 
- inline void updateBufferRangeViaStagingBufferAutoSubmit( - const asset::SBufferRange& bufferRange, const void* data, - IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo = {} - ) + inline bool updateBufferRangeViaStagingBufferAutoSubmit(const SFrontHalfSubmitInfo& submit, const asset::SBufferRange& bufferRange, const void* data) { - if(!submitInfo.isValid()) + if(!submit.valid()) { // TODO: log error - assert(false); - return; + return false; } - auto fence = m_device->createFence(static_cast(0)); - updateBufferRangeViaStagingBufferAutoSubmit(bufferRange, data, submissionQueue, fence.get(), submitInfo); - m_device->blockForFences(1u, &fence.get()); + auto semaphore = m_device->createSemaphore(0); + if (!updateBufferRangeViaStagingBufferAutoSubmit(,bufferRange,data)) + return false; + const ISemaphore::SWaitInfo info = {semaphore.get(),1}; + m_device->blockForSemaphores({&info,1}); + return true; + } + + //! WARNING: This function blocks the CPU and stalls the GPU! + inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(const SFrontHalfSubmitInfo& submit, IGPUBuffer::SCreationParams&& params, const void* data) + { + auto buffer = m_device->createBuffer(std::move(params)); + auto mreqs = buffer->getMemoryReqs(); + mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto mem = m_device->allocate(mreqs,buffer.get()); + if (!updateBufferRangeViaStagingBufferAutoSubmit(submit,asset::SBufferRange{0u,params.size,core::smart_refctd_ptr(buffer)},data)) + return nullptr; + return buffer; } -#endif // pipelineBarrierAutoSubmit? 
From d41f279d50c8ea129776ea2868e4e4cccde47442 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 19:59:47 +0100 Subject: [PATCH 42/62] design clearing up --- include/nbl/video/utilities/IUtilities.h | 155 ++++++++++++----------- 1 file changed, 81 insertions(+), 74 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index bd4c748c7d..79a088ef2d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -195,20 +195,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return allocationSize; } - struct SFrontHalfSubmitInfo final - { - inline bool valid() const {return queue;} - - // Use the last command buffer in intendedNextSubmit, it should be in recording state - inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} - inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} - - - IQueue* queue = {}; - std::span waitSemaphores = {}; - std::span commandBuffers = {}; - }; - //! Struct meant to be used with any utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. + //! Struct meant to be used with any Utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. //! Such functions are non-blocking (unless overflow) and take `SIntendedSubmitInfo` by reference and patch it accordingly. //! MAKE SURE to do a submit to `queue` by yourself with a submit info obtained by casting `this` to `IQueue::SSubmitInfo` ! //! for example: in the case the `frontHalf.waitSemaphores` were already waited upon, the struct will be modified to have it's `waitSemaphores` emptied. 
@@ -219,8 +206,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted { if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) return false; + // Must be resettable so we can end, submit, wait, reset and continue recording commands into it as-if nothing happened if (!frontHalf.getScratchCommandBuffer()->isResettable()) return false; + // It makes no sense to reuse the same commands for a second submission. + // Moreover its dangerous because the utilities record their own internal commands which might use subresources for which + // frees have already been latched on the scratch semaphore you must signal anyway. if (!frontHalf.getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) return false; return true; @@ -263,9 +254,32 @@ class NBL_API2 IUtilities : public core::IReferenceCounted } - //! The last CommandBuffer will be used to record the copy commands - SFrontHalfSubmitInfo frontHalf = {}; - //! The first Semaphore will be used as a scratch, so don't use it yourself as we can advance the counter an arbitrary amount! + //! The last CommandBuffer will be used to record the copy commands + struct SFrontHalf final + { + //! We can't check it, but all (if any) all the command buffers except the last one should be in `EXECUTABLE` state. + inline bool valid() const {return queue;} + + // Use the last command buffer in intendedNextSubmit, it should be in recording state + inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? 
nullptr:commandBuffers.back().cmdbuf;} + + // This parameter is required but may be unused if there is no need to submit + IQueue* queue = {}; + // Use this parameter to wait for previous operations to finish before whatever commands the Utility you're using records + std::span waitSemaphores = {}; + // Fill the commandbuffers you want to run before the first command the Utility records to run in the same submit, + // for example baked command buffers with pipeline barrier commands. + // .... + std::span commandBuffers = {}; + } frontHalf = {}; + //! The first Semaphore will be used as a scratch, so don't choose the values for waits and signals yourself as we can advance the counter an arbitrary amount! + //! You can actually examine the change in `signalSemaphore.front().value` to figure out how many overflows occurred. + //! This semaphore is needed to "stitch together" additional submits if they occur so they occur before and after the original intended waits and signals. + //! We use the first semaphore to keep the intended order of original semaphore signal and waits unchanged no matter how many overflows occur. + //! You do however, NEED TO KEEP IT in the signal set of the last submit you're supposed to do manually, this allows freeing any resources used + //! after the submit is done, indicating that your streaming routine is done. + //! * Also use this parameter to signal new semaphores so that other submits know your Utility method is done. std::span signalSemaphores = {}; private: @@ -280,55 +294,34 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! Copies `data` to stagingBuffer and Records the commands needed to copy the data from stagingBuffer to `bufferRange.buffer` //! If the allocation from staging memory fails due to large buffer size or fragmentation then This function may need to submit the command buffer via the `submissionQueue`. //! Returns: - //! the number of times we overflown and had to submit, <0 [negative] on failure + //! 
True on successful recording of copy commands and handling of overflows, false on failure for any reason. //! Parameters: //! - nextSubmit: //! Is the SubmitInfo you intended to submit your command buffers with, it will be patched if overflow occurred @see SIntendedSubmitInfo //! - bufferRange: contains offset + size into bufferRange::buffer that will be copied from `data` (offset doesn't affect how `data` is accessed) //! - data: raw pointer to data that will be copied to bufferRange::buffer - //! - submissionQueue: IQueue used to submit, when needed. - //! Note: This parameter is required but may not be used if there is no need to submit - //! - scratchSemaphore: - //! - since you've already decided on the semaphores you'll wait and signal in the `intendedNextSubmit`, we need an extra semaphore to "stich together" the submit if we split it - - - //! - This is the fence you will use to submit the copies to, this allows freeing up space in stagingBuffer when the fence is signalled, indicating that the copy has finished. - //! - This fence will be in `UNSIGNALED` state after exiting the function. (It will reset after each implicit submit) - //! - This fence may be used for CommandBuffer submissions using `submissionQueue` inside the function. - //! ** NOTE: This fence will be signalled everytime there is a submission inside this function, which may be more than one until the job is finished. //! Valid Usage: + //! * nextSubmit must be valid (see `SIntendedSubmitInfo::valid()`) + //! * bufferRange must be valid (see `SBufferRange::isValid()`) //! * data must not be nullptr - //! * bufferRange should be valid (see SBufferRange::isValid()) - //! * intendedNextSubmit::commandBufferCount must be > 0 - //! * The commandBuffers should have been allocated from a CommandPool with the same queueFamilyIndex as `submissionQueue` - //! * The last command buffer should be in `RECORDING` state. - //! 
* The last command buffer should be must've called "begin()" with `IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT` flag - //! The reason is the commands recorded into the command buffer would not be valid for a second submission and the stagingBuffer memory wouldv'e been freed/changed. - //! * The last command buffer should be "resettable". See `ICommandBuffer::E_STATE` comments - //! * To ensure correct execution order, (if any) all the command buffers except the last one should be in `EXECUTABLE` state. - //! * submissionQueue must point to a valid IQueue - //! * submissionFence must point to a valid IGPUFence - //! * submissionFence must be in `UNSIGNALED` state - //! ** IUtility::getDefaultUpStreamingBuffer()->cull_frees() should be called before reseting the submissionFence and after fence is signaled. - inline int64_t updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) + inline bool updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { if (!bufferRange.isValid() || !bufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_DST_BIT)) { m_logger.log("Invalid `bufferRange` or buffer has no `EUF_TRANSFER_DST_BIT` usage flag, cannot `updateBufferRangeViaStagingBuffer`!", system::ILogger::ELL_ERROR); - return -1; + return false; } if (!nextSubmit.valid()) { m_logger.log(nextSubmit.ErrorText,system::ILogger::ELL_ERROR); - return -1; + return false; } const auto& limits = m_device->getPhysicalDevice()->getLimits(); const uint32_t optimalTransferAtom = limits.maxResidentInvocations * sizeof(uint32_t); auto cmdbuf = nextSubmit.frontHalf.getScratchCommandBuffer(); - int64_t overflowCounter = 0; // no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal for (size_t uploadedSize=0ull; uploadedSize::invalid_value) { 
nextSubmit.overflowSubmit(); - overflowCounter++; continue; } // some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly @@ -372,18 +364,51 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getScratchSemaphoreNextWait(),&cmdbuf); uploadedSize += subSize; } - return overflowCounter; + return true; } - //! This function is an specialization of the `updateBufferRangeViaStagingBuffer` function above. - //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit + //! This method lets you wrap any other function following the "submit on overflow" pattern with the final submission + //! to `intendedSubmit.queue` happening automatically, no need for the user to handle the submit at the end. //! WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads + //! of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. //! Parameters: - //! - `submitInfo`: IQueue::SSubmitInfo used to submit the copy operations. - //! * Use this parameter to wait for previous operations to finish using submitInfo::waitSemaphores or signal new semaphores using submitInfo::signalSemaphores - //! * Fill submitInfo::commandBuffers with the commandbuffers you want to be submitted before the copy in this struct as well, for example pipeline barrier commands. - //! * Empty by default: waits for no semaphore and signals no semaphores. - //! Patches the submitInfo::commandBuffers + //! - `intendedSubmit`: more lax than regular `SIntendedSubmitInfo::valid()`, only needs a valid queue and at least one semaphore to signal (how else will you know you're done?) + //! 
since the submit must and will happen, there's no point updating the semaphore and commandbuffer info spans in the intendedSubmit + inline bool autoSubmit(const SIntendedSubmitInfo& intendedSubmit, const std::function& what) + { + if (!intendedSubmit.frontHalf.valid() || intendedSubmit.signalSemaphores.empty()) + { + // TODO: log error + return false; + } + + SIntendedSubmitInfo patchedSubmit = intendedSubmit; + if (!what(patchedSubmit)) + return false; + const IQueue::SSubmitInfo submit = patchedSubmit; + return intendedSubmit.frontHalf.queue->submit({&submit,1}); + } + + //! This function is an specialization of the `autoSubmit` function above, it will additionally wait on the Host (CPU) for the final submit to finish. + //! WARNING: This function blocks CPU and stalls the GPU! + inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) + { + auto semaphore = m_device->createSemaphore(0); + // so we begin latching everything on the value of 1, but if we overflow it increases + IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; + + SIntendedSubmitInfo intendedSubmit = {.frontHalf=submit,.signalSemaphores={&info,1}}; + if (!autoSubmit(intendedSubmit,what)) + return false; + + // Watch carefully and note that we might not be waiting on the value of `1` for why @see `SIntendedSubmitInfo::signalSemaphores` + const ISemaphore::SWaitInfo waitInfo = {info.semaphore,info.value}; + m_device->blockForSemaphores({&waitInfo,1}); + return true; + } + +#if 0 + //! Patches the intendedSubmit::frontHalf::commandBuffers //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands //! If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission //! 
If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. @@ -408,34 +433,16 @@ class NBL_API2 IUtilities : public core::IReferenceCounted submissionQueue->submit(1u,&submitInfo,submissionFence); return true; } - - //! This function is an specialization of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. - //! Additionally waits for the upload right away - //! WARNING: This function blocks CPU and stalls the GPU! - inline bool updateBufferRangeViaStagingBufferAutoSubmit(const SFrontHalfSubmitInfo& submit, const asset::SBufferRange& bufferRange, const void* data) - { - if(!submit.valid()) - { - // TODO: log error - return false; - } - - auto semaphore = m_device->createSemaphore(0); - if (!updateBufferRangeViaStagingBufferAutoSubmit(,bufferRange,data)) - return false; - const ISemaphore::SWaitInfo info = {semaphore.get(),1}; - m_device->blockForSemaphores({&info,1}); - return true; - } +#endif //! WARNING: This function blocks the CPU and stalls the GPU! 
- inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(const SFrontHalfSubmitInfo& submit, IGPUBuffer::SCreationParams&& params, const void* data) + inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(const SIntendedSubmitInfo::SFrontHalf& submit, IGPUBuffer::SCreationParams&& params, const void* data) { auto buffer = m_device->createBuffer(std::move(params)); auto mreqs = buffer->getMemoryReqs(); mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); auto mem = m_device->allocate(mreqs,buffer.get()); - if (!updateBufferRangeViaStagingBufferAutoSubmit(submit,asset::SBufferRange{0u,params.size,core::smart_refctd_ptr(buffer)},data)) + if (!autoSubmitAndBlock(submit,[&](auto& info){return updateBufferRangeViaStagingBuffer(info,asset::SBufferRange{0u,params.size,core::smart_refctd_ptr(buffer)},data);})) return nullptr; return buffer; } From 04d05da31a29c9f1ee52f88a5dfc3a1138d019bc Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 22:17:45 +0100 Subject: [PATCH 43/62] Ok we're done here with the Streaming Buffer upload port (removed the IUtilities method triples that only did less sophisticated submits: patched and blocking) See `createFilledDeviceLocalBufferOnDedMem` for how to wrap a utility method in a lambda and achieve the same result --- include/nbl/video/IQueue.h | 3 +- include/nbl/video/utilities/IUtilities.h | 362 ++++++++++------------- 2 files changed, 158 insertions(+), 207 deletions(-) diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index 232f4c6547..654d95a847 100644 --- a/include/nbl/video/IQueue.h +++ b/include/nbl/video/IQueue.h @@ -109,7 +109,8 @@ class IQueue : public core::Interface, public core::Unmovable virtual RESULT waitIdle() const = 0; // we cannot derive from IBackendObject because we can't derive from IReferenceCounted - inline bool wasCreatedBy(const ILogicalDevice* device) const { return device == m_originDevice; } + inline const ILogicalDevice* 
getOriginDevice() const {return m_originDevice;} + inline bool wasCreatedBy(const ILogicalDevice* device) const {return device==m_originDevice;} // Vulkan: const VkQueue* virtual const void* getNativeHandle() const = 0; diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 79a088ef2d..55af9a3750 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -206,13 +206,16 @@ class NBL_API2 IUtilities : public core::IReferenceCounted { if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) return false; + const auto* scratch = frontHalf.getScratchCommandBuffer(); // Must be resettable so we can end, submit, wait, reset and continue recording commands into it as-if nothing happened - if (!frontHalf.getScratchCommandBuffer()->isResettable()) + if (!scratch->isResettable()) return false; // It makes no sense to reuse the same commands for a second submission. // Moreover its dangerous because the utilities record their own internal commands which might use subresources for which // frees have already been latched on the scratch semaphore you must signal anyway. 
- if (!frontHalf.getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + if (!scratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return false; + if (scratch->getState()!=IGPUCommandBuffer::STATE::INITIAL) return false; return true; } @@ -228,6 +231,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted }; } + // One thing you might notice is that this results in a few implicit Memory and Execution Dependencies + // So there's a little bit of non-deterministic behaviour we won't fight (will not insert a barrier every time you "could-have" overflown) inline void overflowSubmit() { auto cmdbuf = frontHalf.getScratchCommandBuffer(); @@ -257,8 +262,93 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! The last CommandBuffer will be used to record the copy commands struct SFrontHalf final { - //! We can't check it, but all (if any) all the command buffers except the last one should be in `EXECUTABLE` state. - inline bool valid() const {return queue;} + //! Need a valid queue and all the command buffers except the last one should be in `EXECUTABLE` state. + inline bool valid() const + { + if (!queue) + return false; + if (!commandBuffers.empty()) + for (size_t i=0; igetState()==IGPUCommandBuffer::STATE::EXECUTABLE) + return false; + return true; + } + + //! Little class to hold the storage for the modified commandbuffer span until submission time. 
+ class CRAIISpanPatch final : core::Uncopyable + { + public: + inline ~CRAIISpanPatch() + { + toNullify->commandBuffers = {}; + } + inline CRAIISpanPatch(CRAIISpanPatch&& other) : CRAIISpanPatch() {operator=(std::move(other));} + inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) + { + commandBuffersStorage = std::move(rhs.commandBuffersStorage); + return *this; + } + + inline operator bool() const {return m_recordingCommandBuffer.get();} + + private: + friend SFrontHalf; + inline CRAIISpanPatch() = default; + inline CRAIISpanPatch(SFrontHalf* _toNull) : commandBuffersStorage(_toNull->commandBuffers.size()+1), toNullify(_toNull) {} + + core::vector commandBuffersStorage; + // If we made a new commandbuffer we need to nullify the span so it doesn't point at stale mem + SFrontHalf* toNullify = nullptr; + // If new one made, then need to hold reference to it, else its just an extra ref, but whatever + core::smart_refctd_ptr m_recordingCommandBuffer; + }; + //! Patches the `commandBuffers` and then makes sure the last command buffer is resettable, in recording state begun with ONE_TIME_SUBMIT + //! If we can't make the last cmdbuffer that way, we make a new one and add it onto the end (hence the name "patching") + //! If `commandBuffers.empty()`, it will create an implicit command buffer to use for recording commands, + //! else if the last command buffer is not feasible to use as scratch for whatever reason, + //! it will add another temporary command buffer to end of `commandBuffers` and use it for recording. + //! WARNING: If patching occurs: + //! - a submission must occur before the return value goes out of scope! + //! - if `!commandBuffers.empty()`, the last CommandBuffer won't be in the same state as it was before entering the function, + //! because it needs to be `end()`ed before the submission + //! - the destructor of the return value will clear `commandBuffers` span + //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. 
+ [[nodiscard("The RAII object returned by `patch()` provides lifetimes to your spans!")]] + inline CRAIISpanPatch patch() + { + if (auto* candidateScratch = getScratchCommandBuffer(); candidateScratch && candidateScratch->isResettable()) + switch(candidateScratch->getState()) + { + case IGPUCommandBuffer::STATE::INITIAL: + if (!candidateScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + break; + [[fallthrough]]; + case IGPUCommandBuffer::STATE::RECORDING: + if (!candidateScratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + break; + { + CRAIISpanPatch retval; + retval.m_recordingCommandBuffer = core::smart_refctd_ptr(candidateScratch); + return retval; + } + break; + default: + break; + } + + CRAIISpanPatch retval(this); + std::copy(commandBuffers.begin(),commandBuffers.end(),retval.commandBuffersStorage.begin()); + { + auto pool = const_cast(queue->getOriginDevice())->createCommandPool(queue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&retval.m_recordingCommandBuffer,1})) + return {}; + if (!retval.m_recordingCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return {}; + retval.commandBuffersStorage.back().cmdbuf = retval.m_recordingCommandBuffer.get(); + } + commandBuffers = retval.commandBuffersStorage; + return retval; + } // Use the last command buffer in intendedNextSubmit, it should be in recording state inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} @@ -270,7 +360,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted std::span waitSemaphores = {}; // Fill the commandbuffers you want to run before the first command the Utility records to run in the same submit, // for example baked command buffers with pipeline barrier commands. - // .... 
+ // Also remember that even though the last CommandBuffer is scratch, it you can record commands into it as well. std::span commandBuffers = {}; } frontHalf = {}; //! The first Semaphore will be used as a scratch, so don't choose the values for waits and signals yourself as we can advance the counter an arbitrary amount! @@ -287,6 +377,61 @@ class NBL_API2 IUtilities : public core::IReferenceCounted static const char* ErrorText; }; + + //! This method lets you wrap any other function following the "submit on overflow" pattern with the final submission + //! to `intendedSubmit.queue` happening automatically, no need for the user to handle the submit at the end. + //! WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads + //! of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. + //! Parameters: + //! - `intendedSubmit`: more lax than regular `SIntendedSubmitInfo::valid()`, only needs a valid queue and at least one semaphore to use as scratch and signal. + //! if you don't have a commandbuffer usable as scratch as the last one, we'll patch internally. 
+ inline IQueue::RESULT autoSubmit(SIntendedSubmitInfo& intendedSubmit, const std::function& what) + { + if (!intendedSubmit.frontHalf.valid() || intendedSubmit.signalSemaphores.empty()) + { + // TODO: log error + return IQueue::RESULT::OTHER_ERROR; + } + + const auto raii = intendedSubmit.frontHalf.patch(); + if (!raii) + { + // TODO: log error + return IQueue::RESULT::OTHER_ERROR; + } + + if (!what(intendedSubmit)) + return IQueue::RESULT::OTHER_ERROR; + intendedSubmit.frontHalf.getScratchCommandBuffer()->end(); + + const IQueue::SSubmitInfo submit = intendedSubmit; + if (const auto error=intendedSubmit.frontHalf.queue->submit({&submit,1}); error!=IQueue::RESULT::SUCCESS) + return error; + // If there's any subsequent submit in a chain, make sure it waits for this one to finish + // (to achieve a command ordering in the cmdbuffer transparent to overflow submits) + intendedSubmit.frontHalf.waitSemaphores = {&intendedSubmit.signalSemaphores.front(),1}; + intendedSubmit.signalSemaphores = {}; + return IQueue::RESULT::SUCCESS; + } + + //! This function is an specialization of the `autoSubmit` function above, it will additionally wait on the Host (CPU) for the final submit to finish. + //! WARNING: This function blocks CPU and stalls the GPU! 
+ inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) + { + auto semaphore = m_device->createSemaphore(0); + // so we begin latching everything on the value of 1, but if we overflow it increases + IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; + + SIntendedSubmitInfo intendedSubmit = {.frontHalf=submit,.signalSemaphores={&info,1}}; + if (autoSubmit(intendedSubmit,what)!=IQueue::RESULT::SUCCESS) + return false; + + // Watch carefully and note that we might not be waiting on the value of `1` for why @see `SIntendedSubmitInfo::signalSemaphores` + const ISemaphore::SWaitInfo waitInfo = {info.semaphore,info.value}; + m_device->blockForSemaphores({&waitInfo,1}); + return true; + } + // -------------- // updateBufferRangeViaStagingBuffer // -------------- @@ -367,74 +512,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return true; } - //! This method lets you wrap any other function following the "submit on overflow" pattern with the final submission - //! to `intendedSubmit.queue` happening automatically, no need for the user to handle the submit at the end. - //! WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads - //! of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. - //! Parameters: - //! - `intendedSubmit`: more lax than regular `SIntendedSubmitInfo::valid()`, only needs a valid queue and at least one semaphore to signal (how else will you know you're done?) - //! 
since the submit must and will happen, there's no point updating the semaphore and commandbuffer info spans in the intendedSubmit - inline bool autoSubmit(const SIntendedSubmitInfo& intendedSubmit, const std::function& what) - { - if (!intendedSubmit.frontHalf.valid() || intendedSubmit.signalSemaphores.empty()) - { - // TODO: log error - return false; - } - - SIntendedSubmitInfo patchedSubmit = intendedSubmit; - if (!what(patchedSubmit)) - return false; - const IQueue::SSubmitInfo submit = patchedSubmit; - return intendedSubmit.frontHalf.queue->submit({&submit,1}); - } - - //! This function is an specialization of the `autoSubmit` function above, it will additionally wait on the Host (CPU) for the final submit to finish. - //! WARNING: This function blocks CPU and stalls the GPU! - inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) - { - auto semaphore = m_device->createSemaphore(0); - // so we begin latching everything on the value of 1, but if we overflow it increases - IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; - - SIntendedSubmitInfo intendedSubmit = {.frontHalf=submit,.signalSemaphores={&info,1}}; - if (!autoSubmit(intendedSubmit,what)) - return false; - - // Watch carefully and note that we might not be waiting on the value of `1` for why @see `SIntendedSubmitInfo::signalSemaphores` - const ISemaphore::SWaitInfo waitInfo = {info.semaphore,info.value}; - m_device->blockForSemaphores({&waitInfo,1}); - return true; - } - -#if 0 - //! Patches the intendedSubmit::frontHalf::commandBuffers - //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission - //! 
If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. - //! WARNING: If commandBufferCount > 0, The last commandBuffer won't be in the same state as it was before entering the function, because it needs to be `end()`ed and submitted - //! Valid Usage: - //! * If submitInfo::commandBufferCount > 0 and the last command buffer must be in one of these stages: `EXECUTABLE`, `INITIAL`, `RECORDING` - //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. - inline bool updateBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) - { - if(!nextSubmit.frontHalf.valid()) - { - // TODO: log error - return false; - } - - CSubmitInfoPatcher submitInfoPatcher; - submitInfoPatcher.patchAndBegin(submitInfo, m_device, submissionQueue->getFamilyIndex()); - submitInfo = updateBufferRangeViaStagingBuffer(bufferRange,data,submissionQueue,submissionFence,submitInfo); - submitInfoPatcher.end(); - - assert(submitInfo.isValid()); - submissionQueue->submit(1u,&submitInfo,submissionFence); - return true; - } -#endif - //! WARNING: This function blocks the CPU and stalls the GPU! inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(const SIntendedSubmitInfo::SFrontHalf& submit, IGPUBuffer::SCreationParams&& params, const void* data) { @@ -629,42 +706,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return intendedNextSubmit; } - //! This function is an specialization of the `downloadBufferRangeViaStagingBufferAutoSubmit` function above. - //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit - //! Parameters: - //! - `submitInfo`: IQueue::SSubmitInfo used to submit the copy operations. - //! 
* Use this parameter to wait for previous operations to finish using submitInfo::waitSemaphores or signal new semaphores using submitInfo::signalSemaphores - //! * Fill submitInfo::commandBuffers with the commandbuffers you want to be submitted before the copy in this struct as well, for example pipeline barrier commands. - //! * Empty by default: waits for no semaphore and signals no semaphores. - //! Patches the submitInfo::commandBuffers - //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. - //! WARNING: If commandBufferCount > 0, The last commandBuffer won't be in the same state as it was before entering the function, because it needs to be `end()`ed and submitted - //! Valid Usage: - //! * If submitInfo::commandBufferCount > 0 and the last command buffer must be in one of these stages: `EXECUTABLE`, `INITIAL`, `RECORDING` - //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. 
- inline void downloadBufferRangeViaStagingBufferAutoSubmit( - const std::function& consumeCallback, const asset::SBufferRange& srcBufferRange, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo submitInfo = {} - ) - { - if (!submitInfo.isValid()) - { - // TODO: log error - assert(false); - return; - } - - CSubmitInfoPatcher submitInfoPatcher; - submitInfoPatcher.patchAndBegin(submitInfo, m_device, submissionQueue->getFamilyIndex()); - submitInfo = downloadBufferRangeViaStagingBuffer(consumeCallback, srcBufferRange, submissionQueue, submissionFence, submitInfo); - submitInfoPatcher.end(); - - assert(submitInfo.isValid()); - submissionQueue->submit(1u, &submitInfo, submissionFence); - } - //! This function is an specialization of the `downloadBufferRangeViaStagingBufferAutoSubmit` function above. //! Additionally waits for the fence //! WARNING: This function blocks CPU and stalls the GPU! @@ -686,13 +727,14 @@ class NBL_API2 IUtilities : public core::IReferenceCounted auto* fenceptr = fence.get(); m_device->blockForFences(1u, &fenceptr); - m_defaultDownloadBuffer->cull_frees(); + //! TODO: NOTE this method cannot be turned into a pure autoSubmitAndBlock + lambda because there's stuff to do AFTER the semaphore wait~! + m_defaultDownloadBuffer->cull_frees(); // its while(poll()) {} now IIRC } - +#endif // -------------- // buildAccelerationStructures // -------------- - +#if 0 // TODO: port later when we have an example //! WARNING: This function blocks the CPU and stalls the GPU! inline void buildAccelerationStructures(IQueue* queue, const core::SRange& pInfos, IGPUAccelerationStructure::BuildRangeInfo* const* ppBuildRangeInfos) { @@ -717,11 +759,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_device->blockForFences(1u,&fence.get()); } - +#endif // -------------- // updateImageViaStagingBuffer // -------------- - +#if 0 // TODO: port //! 
Copies `srcBuffer` to stagingBuffer and Records the commands needed to copy the image from stagingBuffer to `dstImage` //! If the allocation from staging memory fails due to large image size or fragmentation then This function may need to submit the command buffer via the `submissionQueue` and then signal the fence. //! Returns: @@ -765,33 +807,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted [[nodiscard("Use The New IQueue::SubmitInfo")]] IQueue::SSubmitInfo updateImageViaStagingBuffer( asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, IGPUImage::LAYOUT currentDstImageLayout, const core::SRange& regions, IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo intendedNextSubmit); - - //! This function is an specialization of the `updateImageViaStagingBuffer` function above. - //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit - //! Parameters: - //! - `submitInfo`: IQueue::SSubmitInfo used to submit the copy operations. - //! * Use this parameter to wait for previous operations to finish using submitInfo::waitSemaphores or signal new semaphores using submitInfo::signalSemaphores - //! * Fill submitInfo::commandBuffers with the commandbuffers you want to be submitted before the copy in this struct as well, for example pipeline barrier commands. - //! * Empty by default: waits for no semaphore and signals no semaphores. - //! Patches the submitInfo::commandBuffers - //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission - //! 
If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. - //! WARNING: If commandBufferCount > 0, The last commandBuffer won't be in the same state as it was before entering the function, because it needs to be `end()`ed and submitted - //! Valid Usage: - //! * If submitInfo::commandBufferCount > 0 and the last command buffer must be in one of these stages: `EXECUTABLE`, `INITIAL`, `RECORDING` - //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. - void updateImageViaStagingBufferAutoSubmit( - asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, IGPUImage::LAYOUT currentDstImageLayout, const core::SRange& regions, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo submitInfo = {}); - - //! This function is an specialization of the `updateImageViaStagingBufferAutoSubmit` function above. - //! Additionally waits for the fence - //! WARNING: This function blocks CPU and stalls the GPU! - void updateImageViaStagingBufferAutoSubmit( - asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, IGPUImage::LAYOUT currentDstImageLayout, const core::SRange& regions, - IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo = {} - ); #endif protected: @@ -806,72 +821,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return range; } -#if 0 // TODO: port - //! Internal tool used to patch command buffers in submit info. - class CSubmitInfoPatcher - { - public: - //! Patches the submitInfo::commandBuffers and then makes sure the last command buffer is in recording state - //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands - //! 
If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. - //! Params: - //! - submitInfo: IQueue::SSubmitInfo to patch - //! - device: logical device to create new command pool and command buffer if necessary. - //! - newCommandPoolFamIdx: family index to create commandPool with if necessary. - inline void patchAndBegin(IQueue::SSubmitInfo& submitInfo, core::smart_refctd_ptr device, uint32_t newCommandPoolFamIdx) - { - bool needToCreateNewCommandBuffer = false; - if (submitInfo.commandBufferCount <= 0u) - needToCreateNewCommandBuffer = true; - else - { - auto lastCmdBuf = submitInfo.commandBuffers[submitInfo.commandBufferCount - 1u]; - if (lastCmdBuf->getState() == IGPUCommandBuffer::STATE::EXECUTABLE) - needToCreateNewCommandBuffer = true; - } - - // commandBuffer used to record the commands - if (needToCreateNewCommandBuffer) - { - core::smart_refctd_ptr pool = device->createCommandPool(newCommandPoolFamIdx, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - device->createCommandBuffers(pool.get(), IGPUCommandBuffer::LEVEL::PRIMARY, 1u, &m_newCommandBuffer); - - const uint32_t newCommandBufferCount = (needToCreateNewCommandBuffer) ? 
submitInfo.commandBufferCount + 1 : submitInfo.commandBufferCount; - m_allCommandBuffers.resize(newCommandBufferCount); - - for (uint32_t i = 0u; i < submitInfo.commandBufferCount; ++i) - m_allCommandBuffers[i] = submitInfo.commandBuffers[i]; - - m_recordCommandBuffer = m_newCommandBuffer.get(); - m_allCommandBuffers[newCommandBufferCount - 1u] = m_recordCommandBuffer; - - submitInfo.commandBufferCount = newCommandBufferCount; - submitInfo.commandBuffers = m_allCommandBuffers.data(); - - m_recordCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - } - else - { - m_recordCommandBuffer = submitInfo.commandBuffers[submitInfo.commandBufferCount - 1u]; - // If the last command buffer is in INITIAL state, bring it to RECORDING state - if (m_recordCommandBuffer->getState() == IGPUCommandBuffer::STATE::INITIAL) - m_recordCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - } - } - inline void end() - { - m_recordCommandBuffer->end(); - } - inline IGPUCommandBuffer* getRecordingCommandBuffer() { return m_recordCommandBuffer; } - - private: - IGPUCommandBuffer* m_recordCommandBuffer; - core::vector m_allCommandBuffers; - core::smart_refctd_ptr m_newCommandBuffer; // if necessary, then need to hold reference to. 
- }; -#endif core::smart_refctd_ptr m_device; core::smart_refctd_ptr > m_defaultDownloadBuffer; From 3d034c546fa5d44d43b2b31f319ae789320939b9 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 22:32:29 +0100 Subject: [PATCH 44/62] move the SIntendedSubmitInfo struct out of IUtilities --- include/nbl/video/utilities/IUtilities.h | 186 +---------------- .../nbl/video/utilities/SIntendedSubmitInfo.h | 195 ++++++++++++++++++ src/nbl/video/utilities/IUtilities.cpp | 42 +--- 3 files changed, 198 insertions(+), 225 deletions(-) create mode 100644 include/nbl/video/utilities/SIntendedSubmitInfo.h diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 55af9a3750..32baac4b95 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -7,11 +7,9 @@ #include "nbl/asset/asset.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" -#include "nbl/video/IGPUBuffer.h" -#include "nbl/video/IGPUImage.h" -#include "nbl/video/ILogicalDevice.h" #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/alloc/StreamingTransientDataBuffer.h" +#include "nbl/video/utilities/SIntendedSubmitInfo.h" #include "nbl/video/utilities/CPropertyPoolHandler.h" #include "nbl/video/utilities/CScanner.h" #include "nbl/video/utilities/CComputeBlit.h" @@ -194,188 +192,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted )); return allocationSize; } - - //! Struct meant to be used with any Utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. - //! Such functions are non-blocking (unless overflow) and take `SIntendedSubmitInfo` by reference and patch it accordingly. - //! MAKE SURE to do a submit to `queue` by yourself with a submit info obtained by casting `this` to `IQueue::SSubmitInfo` ! - //! for example: in the case the `frontHalf.waitSemaphores` were already waited upon, the struct will be modified to have it's `waitSemaphores` emptied. 
- struct SIntendedSubmitInfo final - { - public: - inline bool valid() const - { - if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) - return false; - const auto* scratch = frontHalf.getScratchCommandBuffer(); - // Must be resettable so we can end, submit, wait, reset and continue recording commands into it as-if nothing happened - if (!scratch->isResettable()) - return false; - // It makes no sense to reuse the same commands for a second submission. - // Moreover its dangerous because the utilities record their own internal commands which might use subresources for which - // frees have already been latched on the scratch semaphore you must signal anyway. - if (!scratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - return false; - if (scratch->getState()!=IGPUCommandBuffer::STATE::INITIAL) - return false; - return true; - } - - inline ISemaphore::SWaitInfo getScratchSemaphoreNextWait() const {return {signalSemaphores.front().semaphore,signalSemaphores.front().value};} - - inline operator IQueue::SSubmitInfo() const - { - return { - .waitSemaphores = frontHalf.waitSemaphores, - .commandBuffers = frontHalf.commandBuffers, - .signalSemaphores = signalSemaphores - }; - } - - // One thing you might notice is that this results in a few implicit Memory and Execution Dependencies - // So there's a little bit of non-deterministic behaviour we won't fight (will not insert a barrier every time you "could-have" overflown) - inline void overflowSubmit() - { - auto cmdbuf = frontHalf.getScratchCommandBuffer(); - auto& scratchSemaphore = signalSemaphores.front(); - // but first sumbit the already buffered up copies - cmdbuf->end(); - IQueue::SSubmitInfo submit = *this; - // we only signal the last semaphore which is used as scratch - submit.signalSemaphores = {&scratchSemaphore,1}; - assert(submit.isValid()); - frontHalf.queue->submit({&submit,1}); - // We wait (stall) on the immediately preceeding 
submission timeline semaphore signal value and increase it for the next signaller - { - const ISemaphore::SWaitInfo info = {scratchSemaphore.semaphore,scratchSemaphore.value++}; - const_cast(cmdbuf->getOriginDevice())->blockForSemaphores({&info,1}); - } - // we've already waited on the Host for the semaphores, no use waiting twice - frontHalf.waitSemaphores = {}; - // since all the commandbuffers have submitted already we only reuse the last one - frontHalf.commandBuffers = {&frontHalf.commandBuffers.back(),1}; - // we will still signal the same set in the future - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - } - - - //! The last CommandBuffer will be used to record the copy commands - struct SFrontHalf final - { - //! Need a valid queue and all the command buffers except the last one should be in `EXECUTABLE` state. - inline bool valid() const - { - if (!queue) - return false; - if (!commandBuffers.empty()) - for (size_t i=0; igetState()==IGPUCommandBuffer::STATE::EXECUTABLE) - return false; - return true; - } - - //! Little class to hold the storage for the modified commandbuffer span until submission time. 
- class CRAIISpanPatch final : core::Uncopyable - { - public: - inline ~CRAIISpanPatch() - { - toNullify->commandBuffers = {}; - } - inline CRAIISpanPatch(CRAIISpanPatch&& other) : CRAIISpanPatch() {operator=(std::move(other));} - inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) - { - commandBuffersStorage = std::move(rhs.commandBuffersStorage); - return *this; - } - - inline operator bool() const {return m_recordingCommandBuffer.get();} - - private: - friend SFrontHalf; - inline CRAIISpanPatch() = default; - inline CRAIISpanPatch(SFrontHalf* _toNull) : commandBuffersStorage(_toNull->commandBuffers.size()+1), toNullify(_toNull) {} - - core::vector commandBuffersStorage; - // If we made a new commandbuffer we need to nullify the span so it doesn't point at stale mem - SFrontHalf* toNullify = nullptr; - // If new one made, then need to hold reference to it, else its just an extra ref, but whatever - core::smart_refctd_ptr m_recordingCommandBuffer; - }; - //! Patches the `commandBuffers` and then makes sure the last command buffer is resettable, in recording state begun with ONE_TIME_SUBMIT - //! If we can't make the last cmdbuffer that way, we make a new one and add it onto the end (hence the name "patching") - //! If `commandBuffers.empty()`, it will create an implicit command buffer to use for recording commands, - //! else if the last command buffer is not feasible to use as scratch for whatever reason, - //! it will add another temporary command buffer to end of `commandBuffers` and use it for recording. - //! WARNING: If patching occurs: - //! - a submission must occur before the return value goes out of scope! - //! - if `!commandBuffers.empty()`, the last CommandBuffer won't be in the same state as it was before entering the function, - //! because it needs to be `end()`ed before the submission - //! - the destructor of the return value will clear `commandBuffers` span - //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. 
- [[nodiscard("The RAII object returned by `patch()` provides lifetimes to your spans!")]] - inline CRAIISpanPatch patch() - { - if (auto* candidateScratch = getScratchCommandBuffer(); candidateScratch && candidateScratch->isResettable()) - switch(candidateScratch->getState()) - { - case IGPUCommandBuffer::STATE::INITIAL: - if (!candidateScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - break; - [[fallthrough]]; - case IGPUCommandBuffer::STATE::RECORDING: - if (!candidateScratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - break; - { - CRAIISpanPatch retval; - retval.m_recordingCommandBuffer = core::smart_refctd_ptr(candidateScratch); - return retval; - } - break; - default: - break; - } - - CRAIISpanPatch retval(this); - std::copy(commandBuffers.begin(),commandBuffers.end(),retval.commandBuffersStorage.begin()); - { - auto pool = const_cast(queue->getOriginDevice())->createCommandPool(queue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&retval.m_recordingCommandBuffer,1})) - return {}; - if (!retval.m_recordingCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - return {}; - retval.commandBuffersStorage.back().cmdbuf = retval.m_recordingCommandBuffer.get(); - } - commandBuffers = retval.commandBuffersStorage; - return retval; - } - - // Use the last command buffer in intendedNextSubmit, it should be in recording state - inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} - inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? 
nullptr:commandBuffers.back().cmdbuf;} - - // This parameter is required but may be unused if there is no need to submit - IQueue* queue = {}; - // Use this parameter to wait for previous operations to finish before whatever commands the Utility you're using records - std::span waitSemaphores = {}; - // Fill the commandbuffers you want to run before the first command the Utility records to run in the same submit, - // for example baked command buffers with pipeline barrier commands. - // Also remember that even though the last CommandBuffer is scratch, it you can record commands into it as well. - std::span commandBuffers = {}; - } frontHalf = {}; - //! The first Semaphore will be used as a scratch, so don't choose the values for waits and signals yourself as we can advance the counter an arbitrary amount! - //! You can actually examine the change in `signalSemaphore.front().value` to figure out how many overflows occurred. - //! This semaphore is needed to "stitch together" additional submits if they occur so they occur before and after the original intended waits and signals. - //! We use the first semaphore to keep the intended order of original semaphore signal and waits unchanged no matter how many overflows occur. - //! You do however, NEED TO KEEP IT in the signal set of the last submit you're supposed to do manually, this allows freeing any resources used - //! after the submit is done, indicating that your streaming routine is done. - //! * Also use this parameter to signal new semaphores so that other submits know your Utility method is done. - std::span signalSemaphores = {}; - - private: - friend class IUtilities; - static const char* ErrorText; - }; //! 
This method lets you wrap any other function following the "submit on overflow" pattern with the final submission diff --git a/include/nbl/video/utilities/SIntendedSubmitInfo.h b/include/nbl/video/utilities/SIntendedSubmitInfo.h new file mode 100644 index 0000000000..19a33d927b --- /dev/null +++ b/include/nbl/video/utilities/SIntendedSubmitInfo.h @@ -0,0 +1,195 @@ +#ifndef _NBL_VIDEO_S_INTENDED_SUBMIT_INFO_H_INCLUDED_ +#define _NBL_VIDEO_S_INTENDED_SUBMIT_INFO_H_INCLUDED_ + + +#include "nbl/video/IGPUCommandBuffer.h" + + +namespace nbl::video +{ + +//! Struct meant to be used with any Utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. +//! Such functions are non-blocking (unless overflow) and take `SIntendedSubmitInfo` by reference and patch it accordingly. +//! MAKE SURE to do a submit to `queue` by yourself with a submit info obtained by casting `this` to `IQueue::SSubmitInfo` ! +//! for example: in the case the `frontHalf.waitSemaphores` were already waited upon, the struct will be modified to have its `waitSemaphores` emptied. +struct SIntendedSubmitInfo final +{ + public: + inline bool valid() const + { + if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) + return false; + const auto* scratch = frontHalf.getScratchCommandBuffer(); + // Must be resettable so we can end, submit, wait, reset and continue recording commands into it as-if nothing happened + if (!scratch->isResettable()) + return false; + // It makes no sense to reuse the same commands for a second submission. + // Moreover it's dangerous because the utilities record their own internal commands which might use subresources for which + // frees have already been latched on the scratch semaphore you must signal anyway. 
+ if (!scratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return false; + if (scratch->getState()!=IGPUCommandBuffer::STATE::INITIAL) + return false; + return true; + } + + inline ISemaphore::SWaitInfo getScratchSemaphoreNextWait() const {return {signalSemaphores.front().semaphore,signalSemaphores.front().value};} + + inline operator IQueue::SSubmitInfo() const + { + return { + .waitSemaphores = frontHalf.waitSemaphores, + .commandBuffers = frontHalf.commandBuffers, + .signalSemaphores = signalSemaphores + }; + } + + // One thing you might notice is that this results in a few implicit Memory and Execution Dependencies + // So there's a little bit of non-deterministic behaviour we won't fight (will not insert a barrier every time you "could-have" overflown) + inline void overflowSubmit() + { + auto cmdbuf = frontHalf.getScratchCommandBuffer(); + auto& scratchSemaphore = signalSemaphores.front(); + // but first submit the already buffered up copies + cmdbuf->end(); + IQueue::SSubmitInfo submit = *this; + // we only signal the last semaphore which is used as scratch + submit.signalSemaphores = {&scratchSemaphore,1}; + assert(submit.isValid()); + frontHalf.queue->submit({&submit,1}); + // We wait (stall) on the immediately preceding submission timeline semaphore signal value and increase it for the next signaller + { + const ISemaphore::SWaitInfo info = {scratchSemaphore.semaphore,scratchSemaphore.value++}; + const_cast(cmdbuf->getOriginDevice())->blockForSemaphores({&info,1}); + } + // we've already waited on the Host for the semaphores, no use waiting twice + frontHalf.waitSemaphores = {}; + // since all the commandbuffers have submitted already we only reuse the last one + frontHalf.commandBuffers = {&frontHalf.commandBuffers.back(),1}; + // we will still signal the same set in the future + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + } + 
+ + //! The last CommandBuffer will be used to record the copy commands + struct SFrontHalf final + { + //! Need a valid queue and all the command buffers except the last one should be in `EXECUTABLE` state. + inline bool valid() const + { + if (!queue) + return false; + if (!commandBuffers.empty()) + for (size_t i=0; igetState()==IGPUCommandBuffer::STATE::EXECUTABLE) + return false; + return true; + } + + //! Little class to hold the storage for the modified commandbuffer span until submission time. + class CRAIISpanPatch final : core::Uncopyable + { + public: + inline ~CRAIISpanPatch() + { + toNullify->commandBuffers = {}; + } + inline CRAIISpanPatch(CRAIISpanPatch&& other) : CRAIISpanPatch() {operator=(std::move(other));} + inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) + { + commandBuffersStorage = std::move(rhs.commandBuffersStorage); + return *this; + } + + inline operator bool() const {return m_recordingCommandBuffer.get();} + + private: + friend SFrontHalf; + inline CRAIISpanPatch() = default; + inline CRAIISpanPatch(SFrontHalf* _toNull) : commandBuffersStorage(_toNull->commandBuffers.size()+1), toNullify(_toNull) {} + + core::vector commandBuffersStorage; + // If we made a new commandbuffer we need to nullify the span so it doesn't point at stale mem + SFrontHalf* toNullify = nullptr; + // If new one made, then need to hold reference to it, else its just an extra ref, but whatever + core::smart_refctd_ptr m_recordingCommandBuffer; + }; + //! Patches the `commandBuffers` and then makes sure the last command buffer is resettable, in recording state begun with ONE_TIME_SUBMIT + //! If we can't make the last cmdbuffer that way, we make a new one and add it onto the end (hence the name "patching") + //! If `commandBuffers.empty()`, it will create an implicit command buffer to use for recording commands, + //! else if the last command buffer is not feasible to use as scratch for whatever reason, + //! 
it will add another temporary command buffer to end of `commandBuffers` and use it for recording. + //! WARNING: If patching occurs: + //! - a submission must occur before the return value goes out of scope! + //! - if `!commandBuffers.empty()`, the last CommandBuffer won't be in the same state as it was before entering the function, + //! because it needs to be `end()`ed before the submission + //! - the destructor of the return value will clear `commandBuffers` span + //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. + [[nodiscard("The RAII object returned by `patch()` provides lifetimes to your spans!")]] + inline CRAIISpanPatch patch() + { + if (auto* candidateScratch = getScratchCommandBuffer(); candidateScratch && candidateScratch->isResettable()) + switch(candidateScratch->getState()) + { + case IGPUCommandBuffer::STATE::INITIAL: + if (!candidateScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + break; + [[fallthrough]]; + case IGPUCommandBuffer::STATE::RECORDING: + if (!candidateScratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + break; + { + CRAIISpanPatch retval; + retval.m_recordingCommandBuffer = core::smart_refctd_ptr(candidateScratch); + return retval; + } + break; + default: + break; + } + + CRAIISpanPatch retval(this); + std::copy(commandBuffers.begin(),commandBuffers.end(),retval.commandBuffersStorage.begin()); + { + auto pool = const_cast(queue->getOriginDevice())->createCommandPool(queue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&retval.m_recordingCommandBuffer,1})) + return {}; + if (!retval.m_recordingCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return {}; + retval.commandBuffersStorage.back().cmdbuf = retval.m_recordingCommandBuffer.get(); + } + commandBuffers = retval.commandBuffersStorage; + return retval; + } + + // 
Use the last command buffer in intendedNextSubmit, it should be in recording state + inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + + // This parameter is required but may be unused if there is no need to submit + IQueue* queue = {}; + // Use this parameter to wait for previous operations to finish before whatever commands the Utility you're using records + std::span waitSemaphores = {}; + // Fill the commandbuffers you want to run before the first command the Utility records to run in the same submit, + // for example baked command buffers with pipeline barrier commands. + // Also remember that even though the last CommandBuffer is scratch, you can record commands into it as well. + std::span commandBuffers = {}; + } frontHalf = {}; + //! The first Semaphore will be used as a scratch, so don't choose the values for waits and signals yourself as we can advance the counter an arbitrary amount! + //! You can actually examine the change in `signalSemaphores.front().value` to figure out how many overflows occurred. + //! This semaphore is needed to "stitch together" additional submits if they occur so they occur before and after the original intended waits and signals. + //! We use the first semaphore to keep the intended order of original semaphore signal and waits unchanged no matter how many overflows occur. + //! You do however, NEED TO KEEP IT in the signal set of the last submit you're supposed to do manually, this allows freeing any resources used + //! after the submit is done, indicating that your streaming routine is done. + //! * Also use this parameter to signal new semaphores so that other submits know your Utility method is done. 
+ std::span signalSemaphores = {}; + + private: + friend class IUtilities; + static const char* ErrorText; +}; + +} + +#endif \ No newline at end of file diff --git a/src/nbl/video/utilities/IUtilities.cpp b/src/nbl/video/utilities/IUtilities.cpp index 5ad7612f1e..397f7021a2 100644 --- a/src/nbl/video/utilities/IUtilities.cpp +++ b/src/nbl/video/utilities/IUtilities.cpp @@ -4,7 +4,7 @@ namespace nbl::video { -const char* IUtilities::SIntendedSubmitInfo::ErrorText = R"===(Invalid `IUtilities::SIntendedSubmitInfo`, possible reasons are: +const char* SIntendedSubmitInfo::ErrorText = R"===(Invalid `IUtilities::SIntendedSubmitInfo`, possible reasons are: - No `commandBuffers` or `signalSemaphores` given in respective spans - `commandBuffer.back()` is not Resettable - `commandBuffer.back()` is not already begun with ONE_TIME_SUBMIT_BIT @@ -168,44 +168,7 @@ IQueue::SSubmitInfo IUtilities::updateImageViaStagingBuffer( } return intendedNextSubmit; } - -void IUtilities::updateImageViaStagingBufferAutoSubmit( - asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, asset::IImage::LAYOUT currentDstImageLayout, const core::SRange& regions, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo submitInfo -) -{ - if(!submitInfo.isValid()) - { - m_logger.log("submitInfo is invalid.", nbl::system::ILogger::ELL_ERROR); - assert(false); - return; - } - - CSubmitInfoPatcher submitInfoPatcher; - submitInfoPatcher.patchAndBegin(submitInfo, m_device, submissionQueue->getFamilyIndex()); - submitInfo = updateImageViaStagingBuffer(srcBuffer,srcFormat,dstImage,currentDstImageLayout,regions,submissionQueue,submissionFence,submitInfo); - submitInfoPatcher.end(); - - assert(submitInfo.isValid()); - submissionQueue->submit(1u,&submitInfo,submissionFence); -} - -void IUtilities::updateImageViaStagingBufferAutoSubmit( - asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, asset::IImage::LAYOUT 
currentDstImageLayout, const core::SRange& regions, - IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo -) -{ - if(!submitInfo.isValid()) - { - m_logger.log("submitInfo is invalid.", nbl::system::ILogger::ELL_ERROR); - assert(false); - return; - } - - auto fence = m_device->createFence(static_cast(0)); - updateImageViaStagingBufferAutoSubmit(srcBuffer,srcFormat,dstImage,currentDstImageLayout,regions,submissionQueue,fence.get(),submitInfo); - m_device->blockForFences(1u,&fence.get()); -} +#endif ImageRegionIterator::ImageRegionIterator( const core::SRange& copyRegions, @@ -762,6 +725,5 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo else return false; } -#endif } // namespace nbl::video \ No newline at end of file From 3160a464ab34b85714c356c4596007f9b40a6dc3 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 23:07:19 +0100 Subject: [PATCH 45/62] going to sleep, next TODO is to implement the IUtilities::downloadBuffer methods --- include/nbl/video/utilities/SIntendedSubmitInfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/utilities/SIntendedSubmitInfo.h b/include/nbl/video/utilities/SIntendedSubmitInfo.h index 19a33d927b..d6f99edcd1 100644 --- a/include/nbl/video/utilities/SIntendedSubmitInfo.h +++ b/include/nbl/video/utilities/SIntendedSubmitInfo.h @@ -55,7 +55,7 @@ struct SIntendedSubmitInfo final IQueue::SSubmitInfo submit = *this; // we only signal the last semaphore which is used as scratch submit.signalSemaphores = {&scratchSemaphore,1}; - assert(submit.isValid()); + assert(submit.valid()); frontHalf.queue->submit({&submit,1}); // We wait (stall) on the immediately preceeding submission timeline semaphore signal value and increase it for the next signaller { From 8670d421dc1e90ef730532dad7f9a373c6a0725a Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 13 Jan 2024 22:02:14 +0100 Subject: [PATCH 46/62] outline the TODO for @theoreticalphysicsftw --- 
include/nbl/ext/ScreenShot/ScreenShot.h | 166 +----------------------- 1 file changed, 3 insertions(+), 163 deletions(-) diff --git a/include/nbl/ext/ScreenShot/ScreenShot.h b/include/nbl/ext/ScreenShot/ScreenShot.h index 536c060ddb..4e04cf1e84 100644 --- a/include/nbl/ext/ScreenShot/ScreenShot.h +++ b/include/nbl/ext/ScreenShot/ScreenShot.h @@ -15,9 +15,10 @@ using namespace nbl::video; Create a ScreenShot with gpu image usage and save it to a file. The queue being passed must have TRANSFER capability. - TODO: Add support for downloading a region of a specific subresource + TODO (Mihailo): Add support for downloading a region of a specific subresource */ +#if 0 // TODO (Mihailo): port inline core::smart_refctd_ptr createScreenShot( ILogicalDevice* logicalDevice, IQueue* queue, @@ -199,169 +200,8 @@ inline bool createScreenShot( IAssetWriter::SAssetWriteParams writeParams(cpuImageView.get()); return assetManager->writeAsset(filename.string(),writeParams); // TODO: Use std::filesystem::path } - -} // namespace nbl::ext::ScreenShot - #endif -#ifdef OLD_CODE // code from old `ditt` branch: - /* - Download mip level image with gpu image usage and save it to IGPUBuffer. - Because of the fence placed by driver the function stalls the CPU - to wait on the GPU to finish, beware of that. - @see IDriverFence - */ - - //! 
TODO: HANDLE UNPACK ALIGNMENT - [[nodiscard]] core::smart_refctd_ptr downloadImageMipLevel(IDriver* driver, IGPUImage* source, IGPUBuffer* destination, uint32_t sourceMipLevel = 0u, size_t destOffset = 0ull, bool implicitflush = true) - { - // will change this, https://github.com/buildaworldnet/IrrlichtBAW/issues/148 - if (isBlockCompressionFormat(source->getCreationParameters().format)) - return nullptr; - - auto extent = source->getMipSize(sourceMipLevel); - IGPUImage::SBufferCopy pRegions[1u] = { {destOffset,extent.x,extent.y,{static_cast(0u),sourceMipLevel,0u,1u},{0u,0u,0u},{extent.x,extent.y,extent.z}} }; - driver->copyImageToBuffer(source, destination, 1u, pRegions); - - return driver->placeFence(implicitflush); - } - - /* - Create a ScreenShot with gpu image usage and save it to a file. - */ - bool createScreenShot(IVideoDriver* driver, IAssetManager* assetManager, const IGPUImageView* gpuImageView, const std::string& outFileName, E_FORMAT convertToFormat=EF_UNKNOWN) - { - auto fetchedImageViewParmas = gpuImageView->getCreationParameters(); - auto gpuImage = fetchedImageViewParmas.image; - auto fetchedImageParams = gpuImage->getCreationParameters(); - auto image = ICPUImage::create(std::move(fetchedImageParams)); - - auto texelBufferRowLength = IImageAssetHandlerBase::calcPitchInBlocks(fetchedImageParams.extent.width * getBlockDimensions(fetchedImageParams.format).X, getTexelOrBlockBytesize(fetchedImageParams.format)); - - auto regions = core::make_refctd_dynamic_array>(1u); - ICPUImage::SBufferCopy& region = regions->front(); - - region.imageSubresource.mipLevel = 0u; - region.imageSubresource.baseArrayLayer = 0u; - region.imageSubresource.layerCount = 1u; - region.bufferOffset = 0u; - region.bufferRowLength = texelBufferRowLength; - region.bufferImageHeight = 0u; - region.imageOffset = { 0u, 0u, 0u }; - region.imageExtent = image->getCreationParameters().extent; - - IDeviceMemoryBacked::SDeviceMemoryRequirements memoryRequirements; - 
memoryRequirements.vulkanReqs.alignment = 64u; - memoryRequirements.vulkanReqs.memoryTypeBits = 0xffffffffu; - memoryRequirements.memoryHeapLocation = IDeviceMemoryAllocation::ESMT_NOT_DEVICE_LOCAL; - memoryRequirements.mappingCapability = IDeviceMemoryAllocation::EMCF_CAN_MAP_FOR_READ | IDeviceMemoryAllocation::EMCF_COHERENT | IDeviceMemoryAllocation::EMCF_CACHED; - memoryRequirements.vulkanReqs.size = image->getImageDataSizeInBytes(); - auto destinationBuffer = driver->createGPUBufferOnDedMem(memoryRequirements); - - auto mapPointerGetterFence = downloadImageMipLevel(driver, gpuImage.get(), destinationBuffer.get()); - - auto destinationBoundMemory = destinationBuffer->getBoundMemory(); - destinationBoundMemory->mapMemoryRange(IDeviceMemoryAllocation::EMCAF_READ, { 0u, memoryRequirements.vulkanReqs.size }); - - auto correctedScreenShotTexelBuffer = core::make_smart_refctd_ptr(memoryRequirements.vulkanReqs.size); - bool flipImage = true; - if(flipImage) - { - auto extent = gpuImage->getMipSize(0u); - uint32_t rowByteSize = extent.x * getTexelOrBlockBytesize(gpuImage->getCreationParameters().format); - for(uint32_t y = 0; y < extent.y; ++y) - { - uint32_t flipped_y = extent.y - y - 1; - memcpy(reinterpret_cast(correctedScreenShotTexelBuffer->getPointer()) + rowByteSize * y, reinterpret_cast(destinationBoundMemory->getMappedPointer()) + rowByteSize * flipped_y, rowByteSize); - } - } - else - { - memcpy(correctedScreenShotTexelBuffer->getPointer(), destinationBoundMemory->getMappedPointer(), memoryRequirements.vulkanReqs.size); - } - - destinationBoundMemory->unmapMemory(); - - image->setBufferAndRegions(std::move(correctedScreenShotTexelBuffer), regions); - - while (mapPointerGetterFence->waitCPU(1000ull, mapPointerGetterFence->canDeferredFlush()) == EDFR_TIMEOUT_EXPIRED) {} - - core::smart_refctd_ptr convertedImage; - if (convertToFormat != EF_UNKNOWN) - { - auto referenceImageParams = image->getCreationParameters(); - auto referenceBuffer = image->getBuffer(); - 
auto referenceRegions = image->getRegions(); - auto referenceRegion = referenceRegions.begin(); - const auto newTexelOrBlockByteSize = getTexelOrBlockBytesize(convertToFormat); - - auto newImageParams = referenceImageParams; - auto newCpuBuffer = core::make_smart_refctd_ptr(referenceBuffer->getSize() * newTexelOrBlockByteSize); - auto newRegions = core::make_refctd_dynamic_array>(referenceRegions.size()); - - for (auto newRegion = newRegions->begin(); newRegion != newRegions->end(); ++newRegion) - { - *newRegion = *(referenceRegion++); - newRegion->bufferOffset = newRegion->bufferOffset * newTexelOrBlockByteSize; - } - - newImageParams.format = convertToFormat; - convertedImage = ICPUImage::create(std::move(newImageParams)); - convertedImage->setBufferAndRegions(std::move(newCpuBuffer), newRegions); - - //CConvertFormatImageFilter TODO: use this one instead with a nice dither @Anastazluk, we could also get rid of a lot of code here, since there's a bunch of constraints - CSwizzleAndConvertImageFilter<> convertFilter; - CSwizzleAndConvertImageFilter<>::state_type state; - - state.swizzle = {}; - state.inImage = image.get(); - state.outImage = convertedImage.get(); - state.inOffset = { 0, 0, 0 }; - state.inBaseLayer = 0; - state.outOffset = { 0, 0, 0 }; - state.outBaseLayer = 0; - //state.dither = ; - - for (auto itr = 0; itr < convertedImage->getCreationParameters().mipLevels; ++itr) - { - auto regionWithMipMap = convertedImage->getRegions(itr).begin(); - - state.extent = regionWithMipMap->getExtent(); - state.layerCount = regionWithMipMap->imageSubresource.layerCount; - state.inMipLevel = regionWithMipMap->imageSubresource.mipLevel; - state.outMipLevel = regionWithMipMap->imageSubresource.mipLevel; - - const bool ok = convertFilter.execute(core::execution::par_unseq,&state); - assert(ok); - } - } - else - convertedImage = image; - auto newCreationParams = convertedImage->getCreationParameters(); - - ICPUImageView::SCreationParams viewParams; - viewParams.flags = 
static_cast(0u); - viewParams.image = convertedImage; - viewParams.format = newCreationParams.format; - viewParams.viewType = ICPUImageView::ET_2D; - viewParams.subresourceRange.baseArrayLayer = 0u; - viewParams.subresourceRange.layerCount = newCreationParams.arrayLayers; - viewParams.subresourceRange.baseMipLevel = 0u; - viewParams.subresourceRange.levelCount = newCreationParams.mipLevels; - - auto imageView = ICPUImageView::create(std::move(viewParams)); - - auto tryToWrite = [&](IAsset* asset) - { - IAssetWriter::SAssetWriteParams wparams(asset); - return assetManager->writeAsset(outFileName, wparams); - }; - - bool status = tryToWrite(convertedImage.get()); - if (!status) - status = tryToWrite(imageView.get()); - - return status; +} // namespace nbl::ext::ScreenShot - } #endif \ No newline at end of file From 2d86373fc40a9c6f7874b5664b7178f182c9b78c Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 14 Jan 2024 00:02:31 +0300 Subject: [PATCH 47/62] fix debugmessenger not being created --- src/nbl/video/CVulkanConnection.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanConnection.cpp b/src/nbl/video/CVulkanConnection.cpp index 3c24dea895..dfdcf0510e 100644 --- a/src/nbl/video/CVulkanConnection.cpp +++ b/src/nbl/video/CVulkanConnection.cpp @@ -234,7 +234,7 @@ core::smart_refctd_ptr CVulkanConnection::create(core::smart_ std::unique_ptr debugCallback = std::make_unique(std::move(logger)); VkDebugUtilsMessengerCreateInfoEXT debugMessengerCreateInfo = { VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, nullptr }; - if (logger && enabledFeatures.debugUtils) + if (enabledFeatures.debugUtils) { debugMessengerCreateInfo.flags = 0; auto debugCallbackFlags = getDebugCallbackFlagsFromLogLevelMask(logLevelMask); @@ -321,4 +321,4 @@ CVulkanConnection::~CVulkanConnection() vkDestroyInstance(m_vkInstance,nullptr); } -} \ No newline at end of file +} From ca2593ce2a7d9f7586b5e057173ebe7967df097f Mon Sep 17 00:00:00 2001 From: 
devsh Date: Sat, 13 Jan 2024 22:49:35 +0100 Subject: [PATCH 48/62] fix a validation error --- src/nbl/video/CVulkanPhysicalDevice.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 030890b187..ecfdaa6f42 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1410,10 +1410,11 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic // extensions - REQUIRE_EXTENSION_IF(enabledFeatures.swapchainMode.hasFlags(E_SWAPCHAIN_MODE::ESM_SURFACE),VK_KHR_SWAPCHAIN_EXTENSION_NAME,nullptr); + const bool swapchainEnabled = enabledFeatures.swapchainMode.hasFlags(E_SWAPCHAIN_MODE::ESM_SURFACE); + REQUIRE_EXTENSION_IF(swapchainEnabled,VK_KHR_SWAPCHAIN_EXTENSION_NAME,nullptr); { // If we reach here then the instance extension VK_KHR_Surface was definitely enabled otherwise the extension wouldn't be reported by physical device - REQUIRE_EXTENSION_IF(true,VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME); + REQUIRE_EXTENSION_IF(swapchainEnabled,VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME); // TODO: https://github.com/Devsh-Graphics-Programming/Nabla/issues/508 } From 461cb4af0b09cb44306fcd571d3d6fd14231e1ae Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 13 Jan 2024 23:19:58 +0100 Subject: [PATCH 49/62] rework pipeline barriers and events to use std::spans --- include/nbl/video/IGPUCommandBuffer.h | 13 ++-- src/nbl/video/CVulkanCommandBuffer.cpp | 89 +++++++++++++------------- src/nbl/video/CVulkanCommandBuffer.h | 2 +- src/nbl/video/IGPUCommandBuffer.cpp | 63 +++++++++--------- 4 files changed, 82 insertions(+), 85 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index fb50d2a7e5..1fd61a19c8 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -146,18 +146,15 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject 
using image_barrier_t = SImageMemoryBarrier; // no dependency flags because they must be 0 per the spec - uint32_t memBarrierCount = 0; - const asset::SMemoryBarrier* memBarriers = nullptr; - uint32_t bufBarrierCount = 0; - const buffer_barrier_t* bufBarriers = nullptr; - uint32_t imgBarrierCount = 0; - const image_barrier_t* imgBarriers = nullptr; + std::span memBarriers = {}; + std::span bufBarriers = {}; + std::span imgBarriers = {}; }; using SEventDependencyInfo = SDependencyInfo; bool setEvent(IEvent* const _event, const SEventDependencyInfo& depInfo); bool resetEvent(IEvent* _event, const core::bitflag stageMask); - bool waitEvents(const uint32_t eventCount, IEvent* const* const pEvents, const SEventDependencyInfo* depInfos); + bool waitEvents(const std::span events, const SEventDependencyInfo* depInfos); struct SOwnershipTransferBarrier { @@ -539,7 +536,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject virtual bool setEvent_impl(IEvent* const _event, const SEventDependencyInfo& depInfo) = 0; virtual bool resetEvent_impl(IEvent* const _event, const core::bitflag stageMask) = 0; - virtual bool waitEvents_impl(const uint32_t eventCount, IEvent* const* const pEvents, const SEventDependencyInfo* depInfos) = 0; + virtual bool waitEvents_impl(const std::span events, const SEventDependencyInfo* depInfos) = 0; virtual bool pipelineBarrier_impl(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) = 0; virtual bool fillBuffer_impl(const asset::SBufferRange& range, const uint32_t data) = 0; diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index 188ca33595..2b1f9d9070 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -98,57 +98,57 @@ static inline auto getVkImageSubresourceFrom(const SubresourceRange& range) -> s template VkDependencyInfoKHR fill( - VkMemoryBarrier2* memoryBarriers, VkBufferMemoryBarrier2* bufferBarriers, 
VkImageMemoryBarrier2* imageBarriers, + VkMemoryBarrier2* const memoryBarriers, VkBufferMemoryBarrier2* const bufferBarriers, VkImageMemoryBarrier2* const imageBarriers, const IGPUCommandBuffer::SDependencyInfo& depInfo, const uint32_t selfQueueFamilyIndex=IQueue::FamilyIgnored ) { VkDependencyInfoKHR info = { VK_STRUCTURE_TYPE_DEPENDENCY_INFO_KHR,nullptr }; - for (auto i=0; isType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2_KHR; + outMem->pNext = nullptr; + fill(*(outMem++),in,selfQueueFamilyIndex); } - for (auto i=0; igetCachedCreationParams().isConcurrentSharing()); - out.buffer = static_cast(in.range.buffer.get())->getInternalObject(); - out.offset = in.range.offset; - out.size = in.range.size; + outBuf->sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2_KHR; + outBuf->pNext = nullptr; // VkExternalMemoryAcquireUnmodifiedEXT + + fill(*outBuf,in.barrier,selfQueueFamilyIndex,in.range.buffer->getCachedCreationParams().isConcurrentSharing()); + outBuf->buffer = static_cast(in.range.buffer.get())->getInternalObject(); + outBuf->offset = in.range.offset; + outBuf->size = in.range.size; + outBuf++; } - for (auto i=0; igetCachedCreationParams().isConcurrentSharing()); - out.image = static_cast(in.image)->getInternalObject(); - out.subresourceRange = getVkImageSubresourceFrom(in.subresourceRange); + outImg->sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2_KHR; + outImg->pNext = nullptr; // VkExternalMemoryAcquireUnmodifiedEXT or VkSampleLocationsInfoEXT + + outImg->oldLayout = getVkImageLayoutFromImageLayout(in.oldLayout); + outImg->newLayout = getVkImageLayoutFromImageLayout(in.newLayout); + fill(*outImg,in.barrier,selfQueueFamilyIndex,in.image->getCachedCreationParams().isConcurrentSharing()); + outImg->image = static_cast(in.image)->getInternalObject(); + outImg->subresourceRange = getVkImageSubresourceFrom(in.subresourceRange); + outImg++; } info.dependencyFlags = 0u; - info.memoryBarrierCount = depInfo.memBarrierCount; + info.memoryBarrierCount = 
depInfo.memBarriers.size(); info.pMemoryBarriers = memoryBarriers; - info.bufferMemoryBarrierCount = depInfo.bufBarrierCount; + info.bufferMemoryBarrierCount = depInfo.bufBarriers.size(); info.pBufferMemoryBarriers = bufferBarriers; - info.imageMemoryBarrierCount = depInfo.imgBarrierCount; + info.imageMemoryBarrierCount = depInfo.imgBarriers.size(); info.pImageMemoryBarriers = imageBarriers; return info; } bool CVulkanCommandBuffer::setEvent_impl(IEvent* const _event, const SEventDependencyInfo& depInfo) { - IGPUCommandPool::StackAllocation memoryBarriers(m_cmdpool,depInfo.memBarrierCount); - IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,depInfo.bufBarrierCount); - IGPUCommandPool::StackAllocation imageBarriers(m_cmdpool,depInfo.imgBarrierCount); + IGPUCommandPool::StackAllocation memoryBarriers(m_cmdpool,depInfo.memBarriers.size()); + IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,depInfo.bufBarriers.size()); + IGPUCommandPool::StackAllocation imageBarriers(m_cmdpool,depInfo.imgBarriers.size()); if (!memoryBarriers || !bufferBarriers || !imageBarriers) return false; @@ -163,11 +163,12 @@ bool CVulkanCommandBuffer::resetEvent_impl(IEvent* const _event, const core::bit return true; } -bool CVulkanCommandBuffer::waitEvents_impl(const uint32_t eventCount, IEvent* const* const pEvents, const SEventDependencyInfo* depInfos) +bool CVulkanCommandBuffer::waitEvents_impl(const std::span events, const SEventDependencyInfo* depInfos) { - IGPUCommandPool::StackAllocation events(m_cmdpool,eventCount); + const uint32_t eventCount = events.size(); + IGPUCommandPool::StackAllocation vk_events(m_cmdpool,eventCount); IGPUCommandPool::StackAllocation infos(m_cmdpool,eventCount); - if (!events || !infos) + if (!vk_events || !infos) return false; uint32_t memBarrierCount = 0u; @@ -175,9 +176,9 @@ bool CVulkanCommandBuffer::waitEvents_impl(const uint32_t eventCount, IEvent* co uint32_t imgBarrierCount = 0u; for (auto i=0u; i 
memoryBarriers(m_cmdpool,memBarrierCount); IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,bufBarrierCount); @@ -190,21 +191,21 @@ bool CVulkanCommandBuffer::waitEvents_impl(const uint32_t eventCount, IEvent* co imgBarrierCount = 0u; for (auto i=0u; i(pEvents[i])->getInternalObject(); + vk_events[i] = static_cast(events[i])->getInternalObject(); infos[i] = fill(memoryBarriers.data()+memBarrierCount,bufferBarriers.data()+bufBarrierCount,imageBarriers.data()+imgBarrierCount,depInfos[i]); memBarrierCount += infos[i].memoryBarrierCount; bufBarrierCount += infos[i].bufferMemoryBarrierCount; imgBarrierCount += infos[i].imageMemoryBarrierCount; } - getFunctionTable().vkCmdWaitEvents2(m_cmdbuf,eventCount,events.data(),infos.data()); + getFunctionTable().vkCmdWaitEvents2(m_cmdbuf,eventCount,vk_events.data(),infos.data()); return true; } bool CVulkanCommandBuffer::pipelineBarrier_impl(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) { - IGPUCommandPool::StackAllocation memoryBarriers(m_cmdpool,depInfo.memBarrierCount); - IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,depInfo.bufBarrierCount); - IGPUCommandPool::StackAllocation imageBarriers(m_cmdpool,depInfo.imgBarrierCount); + IGPUCommandPool::StackAllocation memoryBarriers(m_cmdpool,depInfo.memBarriers.size()); + IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,depInfo.bufBarriers.size()); + IGPUCommandPool::StackAllocation imageBarriers(m_cmdpool,depInfo.imgBarriers.size()); if (!memoryBarriers || !bufferBarriers || !imageBarriers) return false; diff --git a/src/nbl/video/CVulkanCommandBuffer.h b/src/nbl/video/CVulkanCommandBuffer.h index 3d18c7db0b..5e39300f48 100644 --- a/src/nbl/video/CVulkanCommandBuffer.h +++ b/src/nbl/video/CVulkanCommandBuffer.h @@ -48,7 +48,7 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer bool setEvent_impl(IEvent* const _event, const SEventDependencyInfo& depInfo) override; bool resetEvent_impl(IEvent* const _event, 
const core::bitflag stageMask) override; - bool waitEvents_impl(const uint32_t eventCount, IEvent* const* const pEvents, const SEventDependencyInfo* depInfos) override; + bool waitEvents_impl(const std::span events, const SEventDependencyInfo* depInfos) override; bool pipelineBarrier_impl(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) override; bool fillBuffer_impl(const asset::SBufferRange& range, const uint32_t data) override; diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 0f890ffdb8..6d2a3449c0 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -195,18 +195,18 @@ bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo // TODO: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdPipelineBarrier2-None-07891 // TODO: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdPipelineBarrier2-None-07892 // TODO: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkBufferMemoryBarrier2-srcStageMask-03851 - for (auto j=0u; jvalidateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),depInfo.memBarriers[j])) + for (const auto& barrier : depInfo.memBarriers) + if (!device->validateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),barrier)) return true; - for (auto j=0u; jvalidateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),depInfo.bufBarriers[j])) + if (invalidBufferRange(barrier.range,1u,IGPUBuffer::EUF_NONE)) + if (!device->validateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),barrier)) return true; } - for (auto j=0u; jvalidateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),depInfo.imgBarriers[j])) + for (const auto& barrier : depInfo.imgBarriers) + if (!device->validateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),barrier)) return true; #endif // _NBL_DEBUG return false; @@ -261,19 +261,19 @@ bool IGPUCommandBuffer::resetEvent(IEvent* _event, const 
core::bitflag events, const SEventDependencyInfo* depInfos) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT,RENDERPASS_SCOPE::BOTH)) return false; - if (eventCount==0u) + if (events.empty()) return false; uint32_t totalBufferCount = 0u; uint32_t totalImageCount = 0u; - for (auto i=0u; iisCompatibleDevicewise(pEvents[i])) + if (!events[i] || !this->isCompatibleDevicewise(events[i])) return false; const auto& depInfo = depInfos[i]; @@ -283,24 +283,24 @@ bool IGPUCommandBuffer::waitEvents(const uint32_t eventCount, IEvent* const* con if (invalidDependency(depInfo)) return false; - totalBufferCount += depInfo.bufBarrierCount; - totalImageCount += depInfo.imgBarrierCount; + totalBufferCount += depInfo.bufBarriers.size(); + totalImageCount += depInfo.imgBarriers.size(); } - auto* cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,eventCount,pEvents,totalBufferCount,totalImageCount); + auto* cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,events.size(),events.data(),totalBufferCount,totalImageCount); if (!cmd) return false; auto outIt = cmd->getDeviceMemoryBacked(); - for (auto i=0u; i(depInfo.imgBarriers[j].image); + for (const auto& barrier : depInfo.bufBarriers) + *(outIt++) = barrier.range.buffer; + for (const auto& barrier : depInfo.imgBarriers) + *(outIt++) = core::smart_refctd_ptr(barrier.image); } - return waitEvents_impl(eventCount,pEvents,depInfos); + return waitEvents_impl(events,depInfos); } bool IGPUCommandBuffer::pipelineBarrier(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) @@ -308,7 +308,7 @@ bool IGPUCommandBuffer::pipelineBarrier(const core::bitflag bool @@ -338,15 +338,14 @@ bool IGPUCommandBuffer::pipelineBarrier(const core::bitflagm_commandListPool.emplace(m_commandList,depInfo.bufBarrierCount,depInfo.imgBarrierCount); + auto* cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,depInfo.bufBarriers.size(),depInfo.imgBarriers.size()); if (!cmd) return false; 
auto outIt = cmd->getVariableCountResources(); - for (auto j=0u; j(depInfo.imgBarriers[j].image); + for (const auto& barrier : depInfo.bufBarriers) + *(outIt++) = barrier.range.buffer; + for (const auto& barrier : depInfo.imgBarriers) + *(outIt++) = core::smart_refctd_ptr(barrier.image); return pipelineBarrier_impl(dependencyFlags,depInfo); } From d96fd1d8db18b140b3afb8a9b7cf7e4ed1824a68 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 14 Jan 2024 00:16:54 +0100 Subject: [PATCH 50/62] Port `downloadBufferRangeViaStagingBuffer --- include/nbl/video/utilities/IUtilities.h | 105 +++++++---------------- 1 file changed, 32 insertions(+), 73 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 32baac4b95..ed5927d00d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -298,16 +298,15 @@ class NBL_API2 IUtilities : public core::IReferenceCounted uint32_t localOffset = StreamingTransientDataBufferMT<>::invalid_value; m_defaultUploadBuffer.get()->multi_allocate(std::chrono::steady_clock::now()+std::chrono::microseconds(500u),1u,&localOffset,&allocationSize,&m_allocationAlignment); // copy only the unpadded part - if (localOffset != StreamingTransientDataBufferMT<>::invalid_value) + if (localOffset!=StreamingTransientDataBufferMT<>::invalid_value) { const void* dataPtr = reinterpret_cast(data) + uploadedSize; memcpy(reinterpret_cast(m_defaultUploadBuffer->getBufferPointer()) + localOffset, dataPtr, subSize); } - // keep trying again - if (localOffset == StreamingTransientDataBufferMT<>::invalid_value) + else { nextSubmit.overflowSubmit(); - continue; + continue; // keep trying again } // some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly if (m_defaultUploadBuffer.get()->needsManualFlushOrInvalidate()) @@ -402,9 +401,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted StreamingTransientDataBufferMT<>* 
m_downstreamingBuffer; const size_t m_dstOffset; }; -#if 0 // TODO: port + //! Calls the callback to copy the data to a destination Offset - //! * IMPORTANT: To make the copies ready, IUtility::getDefaultDownStreamingBuffer()->cull_frees() should be called after the `submissionFence` is signaled. + //! * IMPORTANT: To make all the callbacks execute, IUtility::getDefaultDownStreamingBuffer()->cull_frees() should be called after the `nextSubmit.signalSemaphores.front()` is signaled. //! If the allocation from staging memory fails due to large image size or fragmentation then This function may need to submit the command buffer via the `submissionQueue` and then signal the fence. //! Returns: //! IQueue::SSubmitInfo to use for command buffer submission instead of `intendedNextSubmit`. @@ -438,50 +437,44 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! * submissionQueue must point to a valid IQueue //! * submissionFence must point to a valid IGPUFence //! * submissionFence must be in `UNSIGNALED` state - [[nodiscard("Use The New IQueue::SubmitInfo")]] inline IQueue::SSubmitInfo downloadBufferRangeViaStagingBuffer( - const std::function& consumeCallback, const asset::SBufferRange& srcBufferRange, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo intendedNextSubmit = {} - ) + inline bool downloadBufferRangeViaStagingBuffer(const std::function& consumeCallback, SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& srcBufferRange) { - if (!intendedNextSubmit.isValid() || intendedNextSubmit.commandBufferCount <= 0u) + if (!srcBufferRange.isValid() || !srcBufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_SRC_BIT)) { - // TODO: log error -> intendedNextSubmit is invalid - assert(false); - return intendedNextSubmit; + m_logger.log("Invalid `srcBufferRange` or buffer has no `EUF_TRANSFER_SRC_BIT` usage flag, cannot `downloadBufferRangeViaStagingBuffer`!",system::ILogger::ELL_ERROR); + return false; 
} - // Use the last command buffer in intendedNextSubmit, it should be in recording state - auto& cmdbuf = intendedNextSubmit.commandBuffers[intendedNextSubmit.commandBufferCount - 1]; - - assert(cmdbuf->getState() == IGPUCommandBuffer::STATE::RECORDING && cmdbuf->isResettable()); - assert(cmdbuf->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)); + if (!nextSubmit.valid()) + { + m_logger.log(nextSubmit.ErrorText, system::ILogger::ELL_ERROR); + return false; + } const auto& limits = m_device->getPhysicalDevice()->getLimits(); const uint32_t optimalTransferAtom = limits.maxResidentInvocations*sizeof(uint32_t); - auto* cmdpool = cmdbuf->getPool(); - assert(cmdpool->getQueueFamilyIndex() == submissionQueue->getFamilyIndex()); - + auto cmdbuf = nextSubmit.frontHalf.getScratchCommandBuffer(); // Basically downloadedSize is downloadRecordedIntoCommandBufferSize :D - for (size_t downloadedSize = 0ull; downloadedSize < srcBufferRange.size;) + for (size_t downloadedSize=0ull; downloadedSizemax_size(); + const uint32_t maxFreeBlock = m_defaultDownloadBuffer->max_size(); // get allocation size - const uint32_t allocationSize = getAllocationSizeForStreamingBuffer(notDownloadedSize, m_allocationAlignment, maxFreeBlock, optimalTransferAtom); - const uint32_t copySize = core::min(allocationSize, notDownloadedSize); + const uint32_t allocationSize = getAllocationSizeForStreamingBuffer(notDownloadedSize,m_allocationAlignment,maxFreeBlock,optimalTransferAtom); + const uint32_t copySize = core::min(allocationSize,notDownloadedSize); uint32_t localOffset = StreamingTransientDataBufferMT<>::invalid_value; m_defaultDownloadBuffer.get()->multi_allocate(std::chrono::steady_clock::now()+std::chrono::microseconds(500u),1u,&localOffset,&allocationSize,&m_allocationAlignment); - if (localOffset != StreamingTransientDataBufferMT<>::invalid_value) + if (localOffset!=StreamingTransientDataBufferMT<>::invalid_value) { IGPUCommandBuffer::SBufferCopy copy; copy.srcOffset 
= srcBufferRange.offset + downloadedSize; copy.dstOffset = localOffset; copy.size = copySize; - cmdbuf->copyBuffer(srcBufferRange.buffer.get(), m_defaultDownloadBuffer.get()->getBuffer(), 1u, ©); + cmdbuf->copyBuffer(srcBufferRange.buffer.get(),m_defaultDownloadBuffer->getBuffer(),1u,©); auto dataConsumer = core::make_smart_refctd_ptr( IDeviceMemoryAllocation::MemoryRange(localOffset,copySize), @@ -490,63 +483,29 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_defaultDownloadBuffer.get(), downloadedSize ); - m_defaultDownloadBuffer.get()->multi_deallocate(1u, &localOffset, &allocationSize, core::smart_refctd_ptr(submissionFence), &dataConsumer.get()); + m_defaultDownloadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getScratchSemaphoreNextWait(),&dataConsumer.get()); downloadedSize += copySize; } - else - { - // but first sumbit the already buffered up copies - cmdbuf->end(); - IQueue::SSubmitInfo submit = intendedNextSubmit; - submit.signalSemaphoreCount = 0u; - submit.pSignalSemaphores = nullptr; - assert(submit.isValid()); - submissionQueue->submit(1u, &submit, submissionFence); - m_device->blockForFences(1u, &submissionFence); - - intendedNextSubmit.commandBufferCount = 1u; - intendedNextSubmit.commandBuffers = &cmdbuf; - intendedNextSubmit.waitSemaphoreCount = 0u; - intendedNextSubmit.pWaitSemaphores = nullptr; - intendedNextSubmit.pWaitDstStageMask = nullptr; - - // before resetting we need poll all events in the allocator's deferred free list - m_defaultDownloadBuffer->cull_frees(); - // we can reset the fence and commandbuffer because we fully wait for the GPU to finish here - m_device->resetFences(1u, &submissionFence); - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - } + else // but first sumbit the already buffered up copies + nextSubmit.overflowSubmit(); } - return intendedNextSubmit; + return true; } //! 
This function is an specialization of the `downloadBufferRangeViaStagingBufferAutoSubmit` function above. //! Additionally waits for the fence //! WARNING: This function blocks CPU and stalls the GPU! - inline void downloadBufferRangeViaStagingBufferAutoSubmit( - const asset::SBufferRange& srcBufferRange, void* data, - IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo = {} - ) + inline bool downloadBufferRangeViaStagingBufferAutoSubmit(const SIntendedSubmitInfo::SFrontHalf& submit,const asset::SBufferRange& srcBufferRange, void* data) { - if (!submitInfo.isValid()) - { - // TODO: log error - assert(false); - return; - } - - - auto fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); - downloadBufferRangeViaStagingBufferAutoSubmit(std::function(default_data_consumption_callback_t(data)), srcBufferRange, submissionQueue, fence.get(), submitInfo); - auto* fenceptr = fence.get(); - m_device->blockForFences(1u, &fenceptr); + if (!autoSubmitAndBlock(submit,[&](SIntendedSubmitInfo& nextSubmit){return downloadBufferRangeViaStagingBuffer(default_data_consumption_callback_t(data),nextSubmit,srcBufferRange);})) + return false; - //! TODO: NOTE this method cannot be turned into a pure autoSubmitAndBlock + lambda because there's stuff to do AFTER the semaphore wait~! - m_defaultDownloadBuffer->cull_frees(); // its while(poll()) {} now IIRC + //! NOTE this method cannot be turned into a pure autoSubmitAndBlock + lambda because there's stuff to do AFTER the semaphore wait~! 
+ m_defaultDownloadBuffer->cull_frees(); + return true; } -#endif + // -------------- // buildAccelerationStructures // -------------- From 2d2acc9e7affb5676e1e7b2635bfe044d10a3a24 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 14 Jan 2024 00:54:23 +0100 Subject: [PATCH 51/62] fix bug in CRAIISpanPatch --- include/nbl/video/utilities/IUtilities.h | 2 +- include/nbl/video/utilities/SIntendedSubmitInfo.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index ed5927d00d..492a1db027 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -496,7 +496,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! This function is an specialization of the `downloadBufferRangeViaStagingBufferAutoSubmit` function above. //! Additionally waits for the fence //! WARNING: This function blocks CPU and stalls the GPU! - inline bool downloadBufferRangeViaStagingBufferAutoSubmit(const SIntendedSubmitInfo::SFrontHalf& submit,const asset::SBufferRange& srcBufferRange, void* data) + inline bool downloadBufferRangeViaStagingBufferAutoSubmit(const SIntendedSubmitInfo::SFrontHalf& submit, const asset::SBufferRange& srcBufferRange, void* data) { if (!autoSubmitAndBlock(submit,[&](SIntendedSubmitInfo& nextSubmit){return downloadBufferRangeViaStagingBuffer(default_data_consumption_callback_t(data),nextSubmit,srcBufferRange);})) return false; diff --git a/include/nbl/video/utilities/SIntendedSubmitInfo.h b/include/nbl/video/utilities/SIntendedSubmitInfo.h index d6f99edcd1..654d73e324 100644 --- a/include/nbl/video/utilities/SIntendedSubmitInfo.h +++ b/include/nbl/video/utilities/SIntendedSubmitInfo.h @@ -99,6 +99,8 @@ struct SIntendedSubmitInfo final inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) { commandBuffersStorage = std::move(rhs.commandBuffersStorage); + std::swap(toNullify,rhs.toNullify); + 
std::swap(m_recordingCommandBuffer,rhs.m_recordingCommandBuffer); return *this; } From 60c1c3916b6e5f209b7597dc7995e3628b9933f1 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 14 Jan 2024 01:08:03 +0100 Subject: [PATCH 52/62] Ported Example 23, and fixed a few bugs here and there --- include/nbl/video/IGPUCommandBuffer.h | 2 +- include/nbl/video/utilities/SIntendedSubmitInfo.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index 1fd61a19c8..f9fd1b5225 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -667,7 +667,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject BOTH = OUTSIDE|INSIDE }; using queue_flags_t = IQueue::FAMILY_FLAGS; - bool checkStateBeforeRecording(const core::bitflag allowedQueueFlags=queue_flags_t::NONE, const core::bitflag renderpassScope=RENDERPASS_SCOPE::BOTH); + bool checkStateBeforeRecording(const core::bitflag allowedQueueFlags=~queue_flags_t::NONE, const core::bitflag renderpassScope=RENDERPASS_SCOPE::BOTH); template bool invalidDependency(const SDependencyInfo& depInfo) const; diff --git a/include/nbl/video/utilities/SIntendedSubmitInfo.h b/include/nbl/video/utilities/SIntendedSubmitInfo.h index 654d73e324..5ad8369943 100644 --- a/include/nbl/video/utilities/SIntendedSubmitInfo.h +++ b/include/nbl/video/utilities/SIntendedSubmitInfo.h @@ -28,7 +28,7 @@ struct SIntendedSubmitInfo final // frees have already been latched on the scratch semaphore you must signal anyway. 
if (!scratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) return false; - if (scratch->getState()!=IGPUCommandBuffer::STATE::INITIAL) + if (scratch->getState()!=IGPUCommandBuffer::STATE::RECORDING) return false; return true; } @@ -93,7 +93,8 @@ struct SIntendedSubmitInfo final public: inline ~CRAIISpanPatch() { - toNullify->commandBuffers = {}; + if (toNullify) + toNullify->commandBuffers = {}; } inline CRAIISpanPatch(CRAIISpanPatch&& other) : CRAIISpanPatch() {operator=(std::move(other));} inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) From 3faf1fb4d3d5322fef0b5e9643513956f4f623ee Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 13 Jan 2024 22:33:00 +0300 Subject: [PATCH 53/62] merge conflicts --- include/nbl/video/CCUDASharedMemory.h | 4 ---- include/nbl/video/utilities/IUtilities.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 9b3e4a0551..1ae9f32ff6 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -20,10 +20,6 @@ namespace nbl::video { -class CCUDAMemoryMapping: public core::IReferenceCounted -{ -}; - class CCUDASharedMemory : public core::IReferenceCounted { public: diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 492a1db027..983c2ab277 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -234,7 +234,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! WARNING: This function blocks CPU and stalls the GPU! 
inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) { - auto semaphore = m_device->createSemaphore(0); + auto semaphore = m_device->createSemaphore(ISemaphore::SCreationParams{.initialValue=0}); // so we begin latching everything on the value of 1, but if we overflow it increases IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; From fd4f7333bf12332ebc5f76fd7f0feb9f01df0d73 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 14 Jan 2024 03:26:13 +0300 Subject: [PATCH 54/62] add missing external resource property queries --- include/nbl/asset/IBuffer.h | 2 + include/nbl/video/IDeviceMemoryAllocation.h | 24 ---- include/nbl/video/IPhysicalDevice.h | 139 ++++++++++++++++++++ src/nbl/video/CVulkanLogicalDevice.cpp | 35 ++++- src/nbl/video/CVulkanPhysicalDevice.h | 69 ++++++++++ 5 files changed, 242 insertions(+), 27 deletions(-) diff --git a/include/nbl/asset/IBuffer.h b/include/nbl/asset/IBuffer.h index e11d8faf7d..d50a415e69 100644 --- a/include/nbl/asset/IBuffer.h +++ b/include/nbl/asset/IBuffer.h @@ -42,6 +42,8 @@ class IBuffer : public core::IBuffer, public IDescriptor //! synthetic Nabla inventions // whether `IGPUCommandBuffer::updateBuffer` can be used on this buffer EUF_INLINE_UPDATE_VIA_CMDBUF = 0x80000000u, + + EUF_SYNTHEHIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if anymore synthethic flags are added*/ }; //! 
diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 64529858ec..28ad0dcfa3 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -83,30 +83,6 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, }; - /* ExternalMemoryProperties *//* provided by VK_KHR_external_memory_capabilities */ - struct SExternalMemoryProperties - { - uint32_t exportableTypes : 7 = ~0u; - uint32_t compatibleTypes : 7 = ~0u; - uint32_t dedicatedOnly : 1 = 0u; - uint32_t exportable : 1 = ~0u; - uint32_t importable : 1 = ~0u; - - bool operator == (SExternalMemoryProperties const& rhs) const = default; - - SExternalMemoryProperties operator &(SExternalMemoryProperties rhs) const - { - rhs.exportableTypes &= exportableTypes; - rhs.compatibleTypes &= compatibleTypes; - rhs.dedicatedOnly |= dedicatedOnly; - rhs.exportable &= exportable; - rhs.importable &= importable; - return rhs; - } - }; - - static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); - // const ILogicalDevice* getOriginDevice() const {return m_originDevice;} diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 583c8ac9d0..30459e1667 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -26,8 +26,54 @@ namespace nbl::video { + + + class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable { + template static constexpr bool is_bitflag = false; + template static constexpr bool is_bitflag> = true; + + template struct RequestMapTraits; + templatestruct RequestMapTraits : RequestMapTraits {}; + template struct RequestMapTraits + { + using Key = std::tuple...>; + struct Hasher + { + template + static size_t hash(size_t seed, Key const& key) + { + if constexpr (0 == N) + return seed; + else + { + using cur = std::remove_cvref_t(key))>; + + if constexpr 
(is_bitflag) + core::hash_combine(seed, cur::UNDERLYING_TYPE(std::get(key).value)); + else if constexpr (std::is_convertible_v) + core::hash_combine(seed, size_t(std::get(key))); + else + core::hash_combine(seed, std::get(key)); + + return hash(seed, key); + } + + } + + size_t operator()(Key const& key) const + { + return hash(0, key); + } + }; + + using Map = std::unordered_map; + }; + + template + using RequestMap = typename RequestMapTraits::Map; + public: // virtual E_API_TYPE getAPIType() const = 0; @@ -242,6 +288,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable !! Same goes for `vkGetPhysicalDeviceSparseImageFormatProperties2` */ + struct SFormatBufferUsages { struct SUsage @@ -687,6 +734,81 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return createLogicalDevice_impl(std::move(params)); } + + /* ExternalMemoryProperties *//* provided by VK_KHR_external_memory_capabilities */ + struct SExternalMemoryProperties + { + uint32_t exportableTypes : 7 = ~0u; + uint32_t compatibleTypes : 7 = ~0u; + uint32_t dedicatedOnly : 1 = 0u; + uint32_t exportable : 1 = ~0u; + uint32_t importable : 1 = ~0u; + + bool operator == (SExternalMemoryProperties const& rhs) const = default; + + SExternalMemoryProperties operator &(SExternalMemoryProperties rhs) const + { + rhs.exportableTypes &= exportableTypes; + rhs.compatibleTypes &= compatibleTypes; + rhs.dedicatedOnly |= dedicatedOnly; + rhs.exportable &= exportable; + rhs.importable &= importable; + return rhs; + } + }; + + static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); + + struct SImageFormatProperties + { + VkExtent3D maxExtent = {}; + uint32_t maxMipLevels = {}; + uint32_t maxArrayLayers = {}; + IGPUImage::E_SAMPLE_COUNT_FLAGS sampleCounts = IGPUImage::ESCF_1_BIT; + uint64_t maxResourceSize = 0; + + bool operator == (SImageFormatProperties const& rhs) const = default; + }; + + struct SExternalImageFormatProperties : 
SImageFormatProperties, SExternalMemoryProperties + { + }; + + SExternalMemoryProperties getExternalBufferProperties( + core::bitflag usage, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + usage &= ~asset::IBuffer::EUF_SYNTHEHIC_FLAGS_MASK; // mask out synthetic flags + { + std::shared_lock lock(m_externalBufferPropertiesMutex); + auto it = m_externalBufferProperties.find({ usage, handleType }); + if (it != m_externalBufferProperties.end()) + return it->second; + } + + std::unique_lock lock(m_externalBufferPropertiesMutex); + return m_externalBufferProperties[{ usage, handleType }] = getExternalBufferProperties_impl(usage, handleType); + } + + SExternalImageFormatProperties getExternalImageProperties( + asset::E_FORMAT format, + IGPUImage::TILING tiling, + core::bitflag usage, + core::bitflag flags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + auto key = std::tuple{ format, tiling, usage, flags, handleType }; + { + std::shared_lock lock(m_externalImagePropertiesMutex); + auto it = m_externalImageProperties.find(key); + if (it != m_externalImageProperties.end()) + return it->second; + } + + std::unique_lock lock(m_externalImagePropertiesMutex); + return m_externalImageProperties[key] = getExternalImageProperties_impl(format, tiling, usage, flags, handleType); + } + protected: struct SInitData final { @@ -745,6 +867,23 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return 220u; // largest from above } + // external memory + /* ExternalBufferProperties *//* provided by VK_KHR_external_memory_capabilities */ + + + virtual SExternalMemoryProperties getExternalBufferProperties_impl(core::bitflag usage, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + mutable RequestMap m_externalBufferProperties; + mutable std::shared_mutex m_externalBufferPropertiesMutex; + + virtual SExternalImageFormatProperties getExternalImageProperties_impl( + asset::E_FORMAT format, + 
IGPUImage::TILING tiling, + core::bitflag usage, + core::bitflag flags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + mutable RequestMap m_externalImageProperties; + mutable std::shared_mutex m_externalImagePropertiesMutex; + // Format Promotion struct SBufferFormatPromotionRequestHash { diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 844bfc54cb..39b61d7c53 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -337,15 +337,44 @@ bool CVulkanLogicalDevice::bindImageMemory_impl(const uint32_t count, const SBin core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) { VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; - // VkBufferDeviceAddressCreateInfoEXT, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR - vk_createInfo.pNext = nullptr; + // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkBufferDeviceAddressCreateInfoEXT, VkBufferOpaqueCaptureAddressCreateInfo, VkDedicatedAllocationBufferCreateInfoNV, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR + + VkExternalMemoryBufferCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO, + .handleTypes = creationParams.externalHandleTypes.value, + }; + + const bool external = creationParams.externalHandleTypes.value; + + vk_createInfo.pNext = external ? &externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); vk_createInfo.usage = getVkBufferUsageFlagsFromBufferUsageFlags(creationParams.usage); - vk_createInfo.sharingMode = creationParams.isConcurrentSharing() ? 
VK_SHARING_MODE_CONCURRENT:VK_SHARING_MODE_EXCLUSIVE; + vk_createInfo.sharingMode = creationParams.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = creationParams.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = creationParams.queueFamilyIndices; + bool dedicatedOnly = false; + + if (external) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(creationParams.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + // TODO: Handle this + dedicatedOnly = props.dedicatedOnly; + } + } + VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) return nullptr; diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index c1552c88f1..56069a3dd4 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -109,6 +109,75 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice // [NOOP] If sparseImageFloat32AtomicMinMax is enabled, shaderImageFloat32AtomicMinMax must be enabled } + inline static SExternalMemoryProperties mapExternalMemoryProps(VkExternalMemoryProperties const& props) + { + return { + .exportableTypes = props.exportFromImportedHandleTypes, + .compatibleTypes = props.compatibleHandleTypes, + .dedicatedOnly = props.externalMemoryFeatures & VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT ? 1u : 0u, + .exportable = props.externalMemoryFeatures & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT ? 
1u : 0u, + .importable = props.externalMemoryFeatures & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT ? 1u : 0u, + }; + } + + SExternalMemoryProperties getExternalBufferProperties_impl(core::bitflag usage, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override + { + assert(!(handleType & (handleType - 1))); + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO, + .usage = static_cast(usage.value), + .handleType = static_cast(handleType) + }; + VkExternalBufferProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES }; + vkGetPhysicalDeviceExternalBufferProperties(m_vkPhysicalDevice, &info, &externalProps); + return mapExternalMemoryProps(externalProps.externalMemoryProperties); + } + + SExternalImageFormatProperties getExternalImageProperties_impl( + asset::E_FORMAT format, + IGPUImage::TILING tiling, + core::bitflag usage, + core::bitflag flags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override + { + assert(!(handleType & (handleType - 1))); + + VkPhysicalDeviceExternalImageFormatInfo extInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO, + .handleType = static_cast(handleType), + }; + + VkPhysicalDeviceImageFormatInfo2 info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = &extInfo, + .format = static_cast(format), + .tiling = static_cast(tiling), + .usage = usage.value, + .flags = flags.value, + }; + + VkExternalImageFormatProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES }; + + VkImageFormatProperties2 props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = &externalProps, + }; + + vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &info, &props); + + return + { + { + .maxExtent = props.imageFormatProperties.maxExtent, + .maxMipLevels = props.imageFormatProperties.maxMipLevels, + .maxArrayLayers = 
props.imageFormatProperties.maxArrayLayers, + .sampleCounts = static_cast(props.imageFormatProperties.sampleCounts), + .maxResourceSize = props.imageFormatProperties.maxResourceSize, + }, + mapExternalMemoryProps(externalProps.externalMemoryProperties) + }; + } + core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) override; private: From 5b1940cc22f1affe6a7deed2022d0a51966952a8 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 14 Jan 2024 04:40:31 +0300 Subject: [PATCH 55/62] add more stuff --- include/nbl/video/CCUDASharedMemory.h | 2 +- include/nbl/video/CVulkanDeviceMemoryBacked.h | 6 +- include/nbl/video/IPhysicalDevice.h | 6 +- src/nbl/video/CCUDASharedMemory.cpp | 8 +-- src/nbl/video/CVulkanBuffer.h | 2 +- src/nbl/video/CVulkanDeviceMemoryBacked.cpp | 6 +- src/nbl/video/CVulkanLogicalDevice.cpp | 60 +++++++++++++++---- src/nbl/video/CVulkanPhysicalDevice.h | 8 ++- 8 files changed, 68 insertions(+), 30 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 1ae9f32ff6..117a1ff4b5 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -49,7 +49,7 @@ class CCUDASharedMemory : public core::IReferenceCounted core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - core::smart_refctd_ptr exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; + core::smart_refctd_ptr createAndBindImage(ILogicalDevice* device, IGPUImage::SCreationParams&& params) const; protected: diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index c996000e04..2505de6865 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -35,11 +35,11 @@ class CVulkanDeviceMemoryBacked : public Interface protected: // special constructor for when memory requirements are known 
up-front (so far only swapchains and internal forwarding here) CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const IDeviceMemoryBacked::SDeviceMemoryRequirements& _memReqs, const VkResource_t vkHandle); - CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const VkResource_t vkHandle) : - CVulkanDeviceMemoryBacked(dev,std::move(_creationParams),obtainRequirements(dev,vkHandle),vkHandle) {} + CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, bool dedicatedOnly, const VkResource_t vkHandle) : + CVulkanDeviceMemoryBacked(dev,std::move(_creationParams), obtainRequirements(dev, dedicatedOnly, vkHandle),vkHandle) {} private: - static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle); + static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle); core::smart_refctd_ptr m_memory = nullptr; size_t m_offset = 0u; diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 30459e1667..e32a65d9f3 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -793,11 +793,12 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable SExternalImageFormatProperties getExternalImageProperties( asset::E_FORMAT format, IGPUImage::TILING tiling, + IGPUImage::E_TYPE type, core::bitflag usage, core::bitflag flags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const { - auto key = std::tuple{ format, tiling, usage, flags, handleType }; + auto key = std::tuple{ format, tiling, type, usage, flags, handleType }; { std::shared_lock lock(m_externalImagePropertiesMutex); auto it = m_externalImageProperties.find(key); @@ -806,7 +807,7 @@ class NBL_API2 
IPhysicalDevice : public core::Interface, public core::Unmovable } std::unique_lock lock(m_externalImagePropertiesMutex); - return m_externalImageProperties[key] = getExternalImageProperties_impl(format, tiling, usage, flags, handleType); + return m_externalImageProperties[key] = getExternalImageProperties_impl(format, tiling, type, usage, flags, handleType); } protected: @@ -878,6 +879,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable virtual SExternalImageFormatProperties getExternalImageProperties_impl( asset::E_FORMAT format, IGPUImage::TILING tiling, + IGPUImage::E_TYPE type, core::bitflag usage, core::bitflag flags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 3ebb8e211d..82d6f496cd 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -77,16 +77,12 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDev #endif -core::smart_refctd_ptr CCUDASharedMemory::exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const +core::smart_refctd_ptr CCUDASharedMemory::createAndBindImage(ILogicalDevice* device, IGPUImage::SCreationParams&& params) const { if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) return nullptr; - auto img = device->createImage({ - std::move(params), {{ .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE }}, - IGPUImage::TILING::LINEAR, - 1 /*preinitialized*/, - }); + auto img = device->createImage(std::move(params)); if (exportAsMemory(device, img.get())) return img; diff --git a/src/nbl/video/CVulkanBuffer.h b/src/nbl/video/CVulkanBuffer.h index 4596981c2a..988d50c2ec 100644 --- a/src/nbl/video/CVulkanBuffer.h +++ b/src/nbl/video/CVulkanBuffer.h @@ -16,7 +16,7 @@ class CVulkanBuffer : public CVulkanDeviceMemoryBacked using base_t = CVulkanDeviceMemoryBacked; public: - inline 
CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, const VkBuffer buffer) : base_t(dev,std::move(creationParams),buffer) {} + inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly, const VkBuffer buffer) : base_t(dev,std::move(creationParams), dedicatedOnly, buffer) {} void setObjectDebugName(const char* label) const override; diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 2bec9e9d06..8f08f9aa67 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -6,7 +6,7 @@ namespace nbl::video { template -IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle) +IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle) { const std::conditional_t vk_memoryRequirementsInfo = { IsImage ? 
VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2:VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,nullptr,vkHandle @@ -24,8 +24,8 @@ IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked CVulkanLogicalDevice::createBuffer_impl(IGPUB VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(creationParams),vk_buffer); + return core::make_smart_refctd_ptr(this,std::move(creationParams), dedicatedOnly, vk_buffer); } core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) @@ -399,17 +399,24 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_im core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params) { - VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, nullptr }; - vk_stencilUsage.stencilUsage = getVkImageUsageFlagsFromImageUsageFlags(params.actualStencilUsage().value,true); + VkExternalMemoryImageCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, + .handleTypes = params.externalHandleTypes.value, + }; + + const bool external = params.externalHandleTypes.value; + + VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, &externalMemoryInfo }; + vk_stencilUsage.stencilUsage = getVkImageUsageFlagsFromImageUsageFlags(params.actualStencilUsage().value, true); - std::array vk_formatList; + std::array vk_formatList; VkImageFormatListCreateInfo vk_formatListStruct = { VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO, &vk_stencilUsage }; vk_formatListStruct.viewFormatCount = 0u; // if only there existed a nice iterator that would let me iterate over set bits 64 faster if (params.viewFormats.any()) - for (auto fmt=0; fmt(fmt)); + for (auto fmt = 0; fmt < 
vk_formatList.size(); fmt++) + if (params.viewFormats.test(fmt)) + vk_formatList[vk_formatListStruct.viewFormatCount++] = getVkFormatFromFormat(static_cast(fmt)); vk_formatListStruct.pViewFormats = vk_formatList.data(); VkImageCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, &vk_formatListStruct }; @@ -421,16 +428,45 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_createInfo.arrayLayers = params.arrayLayers; vk_createInfo.samples = static_cast(params.samples); vk_createInfo.tiling = static_cast(params.tiling); - vk_createInfo.usage = getVkImageUsageFlagsFromImageUsageFlags(params.usage.value,asset::isDepthOrStencilFormat(params.format)); - vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT:VK_SHARING_MODE_EXCLUSIVE; + vk_createInfo.usage = getVkImageUsageFlagsFromImageUsageFlags(params.usage.value, asset::isDepthOrStencilFormat(params.format)); + vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = params.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = params.queueFamilyIndices; - vk_createInfo.initialLayout = params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED:VK_IMAGE_LAYOUT_UNDEFINED; + vk_createInfo.initialLayout = params.preinitialized ? 
VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED; + + bool dedicatedOnly = false; + if (external) + { + core::bitflag requestedTypes = params.externalHandleTypes; + auto pd = dynamic_cast(m_physicalDevice)->getInternalObject(); + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalImageProperties(params.format, params.tiling, params.type, params.usage, params.flags, handleType); + + if (props.maxArrayLayers < vk_createInfo.arrayLayers || + !core::bitflag(props.sampleCounts).hasFlags(params.samples) || + /* props.maxResourceSize?? */ + props.maxExtent.width < vk_createInfo.extent.width || + props.maxExtent.height < vk_createInfo.extent.height || + props.maxExtent.depth < vk_createInfo.extent.depth) + { + return nullptr; + } + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(params.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + dedicatedOnly |= props.dedicatedOnly; + } + } VkImage vk_image; - if (m_devf.vk.vkCreateImage(m_vkdev,&vk_createInfo,nullptr,&vk_image)!=VK_SUCCESS) + if (m_devf.vk.vkCreateImage(m_vkdev, &vk_createInfo, nullptr, &vk_image) != VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(params),vk_image); + return core::make_smart_refctd_ptr(this, std::move(params), dedicatedOnly, vk_image); } core::smart_refctd_ptr CVulkanLogicalDevice::createImageView_impl(IGPUImageView::SCreationParams&& params) diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index 56069a3dd4..9cfebccd3f 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -136,6 +136,7 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice SExternalImageFormatProperties getExternalImageProperties_impl( asset::E_FORMAT format, IGPUImage::TILING tiling, 
+ IGPUImage::E_TYPE type, core::bitflag usage, core::bitflag flags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override @@ -150,7 +151,8 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice VkPhysicalDeviceImageFormatInfo2 info = { .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, .pNext = &extInfo, - .format = static_cast(format), + .format = getVkFormatFromFormat(format), + .type = static_cast(type), .tiling = static_cast(tiling), .usage = usage.value, .flags = flags.value, @@ -163,7 +165,9 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice .pNext = &externalProps, }; - vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &info, &props); + VkResult re = vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &info, &props); + if(VK_SUCCESS != re) + return {}; return { From 3d9a5309206fe219180d3610d3fb82ac93c4458c Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 19:11:02 +0300 Subject: [PATCH 56/62] address pr comments --- include/nbl/video/CCUDADevice.h | 19 ++++++------- include/nbl/video/CCUDASharedMemory.h | 5 ++-- include/nbl/video/IDeviceMemoryAllocation.h | 27 +++++++++++------- include/nbl/video/IDeviceMemoryAllocator.h | 16 +++-------- include/nbl/video/IDeviceMemoryBacked.h | 2 +- include/nbl/video/ILogicalDevice.h | 31 ++++++--------------- include/nbl/video/ISemaphore.h | 4 +-- include/nbl/video/SPhysicalDeviceLimits.h | 4 --- include/nbl/video/utilities/IUtilities.h | 2 +- 9 files changed, 44 insertions(+), 66 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 7b2b952548..551c2a7e5b 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -37,6 +37,13 @@ class CCUDADevice : public core::IReferenceCounted static constexpr IDeviceMemoryBacked::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryBacked::EHT_OPAQUE_FD; static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = 
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif + struct SCUDACleaner : video::ICleanup + { + core::smart_refctd_ptr resource; + SCUDACleaner(core::smart_refctd_ptr resource) + : resource(std::move(resource)) + { } + }; enum E_VIRTUAL_ARCHITECTURE { @@ -95,18 +102,10 @@ class CCUDADevice : public core::IReferenceCounted protected: CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + + // CUDAHandler creates CUDADevice, it needs to access ctor friend class CCUDAHandler; - friend class CCUDASharedMemory; - friend class CCUDASharedSemaphore; - struct SCUDACleaner : video::ICleanup - { - core::smart_refctd_ptr resource; - SCUDACleaner(core::smart_refctd_ptr resource) - : resource(std::move(resource)) - { } - }; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _handle, core::smart_refctd_ptr&& _handler); ~CCUDADevice(); diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 117a1ff4b5..d900087d06 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -23,6 +23,7 @@ namespace nbl::video class CCUDASharedMemory : public core::IReferenceCounted { public: + // required for us to see the move ctor friend class CCUDADevice; CUdeviceptr getDeviceptr() const { return m_params.ptr; } @@ -49,11 +50,11 @@ class CCUDASharedMemory : public core::IReferenceCounted core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - core::smart_refctd_ptr createAndBindImage(ILogicalDevice* device, IGPUImage::SCreationParams&& params) const; + core::smart_refctd_ptr createAndBindImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; protected: - CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) + 
CCUDASharedMemory(core::smart_refctd_ptr&& device, SCachedCreationParams&& params) : m_device(std::move(device)) , m_params(std::move(params)) {} diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 28ad0dcfa3..9ca663b9ea 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -164,14 +164,21 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Constant variant of getMappedPointer inline const void* getMappedPointer() const { return m_mappedPtr; } - struct SCreationParams + struct SInfo + { + uint64_t allocationSize = 0; + core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE; + // Handle Type for external resources + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE + //! Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE + void* externalHandle = nullptr; + }; + + struct SCreationParams: SInfo { - core::bitflag allocateFlags = E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE; core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; - E_EXTERNAL_HANDLE_TYPE externalHandleType = E_EXTERNAL_HANDLE_TYPE::EHT_NONE; - void* externalHandle = nullptr; const bool dedicated = false; - const size_t allocationSize; }; protected: @@ -183,10 +190,10 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted IDeviceMemoryAllocation( const ILogicalDevice* originDevice, SCreationParams&& params = {}) : m_originDevice(originDevice) + , m_params(std::move(params)) , m_mappedPtr(nullptr) , m_mappedRange{ 0, 0 } , m_currentMappingAccess(EMCAF_NO_MAPPING_ACCESS) - , m_params(std::move(params)) {} virtual void* map_impl(const MemoryRange& range, const core::bitflag accessHint) = 0; @@ -194,10 +201,10 @@ class IDeviceMemoryAllocation : public 
virtual core::IReferenceCounted const ILogicalDevice* m_originDevice = nullptr; - uint8_t* m_mappedPtr; - MemoryRange m_mappedRange; - core::bitflag m_currentMappingAccess; - SCreationParams m_params; + SCreationParams m_params = {}; + uint8_t* m_mappedPtr = nullptr; + MemoryRange m_mappedRange = {}; + core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; std::unique_ptr m_postDestroyCleanup = nullptr; }; diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 408efd6da4..22ea3c8238 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -12,19 +12,11 @@ namespace nbl::video class IDeviceMemoryAllocator { public: - struct SAllocateInfo + struct SAllocateInfo: IDeviceMemoryAllocation::SInfo { - size_t size : 54 = 0ull; - size_t flags : 5 = 0u; // IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS - size_t memoryTypeIndex : 5 = 0u; + uint32_t memoryTypeIndex = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. - - // Handle Type for external resources - IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; - //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE - //! Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE - void* externalHandle = nullptr; }; //! 
IMemoryTypeIterator extracts memoryType indices from memoryTypeBits in arbitrary order @@ -54,8 +46,8 @@ class IDeviceMemoryAllocator inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) { SAllocateInfo ret = {}; - ret.size = m_reqs.size; - ret.flags = m_allocateFlags; + ret.allocationSize = m_reqs.size; + ret.allocateFlags = core::bitflag(m_allocateFlags); ret.memoryTypeIndex = dereference(); ret.dedication = dedication; ret.externalHandleType = m_handleType; diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index 278e681a35..0071a53d71 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -126,7 +126,7 @@ class IDeviceMemoryBacked : public IBackendObject //! members SCachedCreationParams m_cachedCreationParams; - SDeviceMemoryRequirements m_cachedMemoryReqs; + const SDeviceMemoryRequirements m_cachedMemoryReqs; void* m_cachedExternalHandle = nullptr; }; diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 49585f3413..a102005371 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -147,7 +147,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual IQueue::RESULT waitIdle() const = 0; //! Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) = 0; + virtual core::smart_refctd_ptr createSemaphore(uint64_t initialValue = 0, ISemaphore::SCreationParams&& = {}) = 0; virtual ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) = 0; // Forever waiting variant if you're confident that the fence will eventually be signalled inline ISemaphore::WAIT_RESULT blockForSemaphores(const std::span infos, const bool waitAll=true) @@ -285,29 +285,14 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe //! 
Descriptor Creation // Buffer (@see ICPUBuffer) - inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams) - { - const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; - if (creationParams.size>maxSize) - { - m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit!",system::ILogger::ELL_ERROR,creationParams.size,this,maxSize); - return nullptr; - } - return createBuffer_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); + // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); + // Creates an Image (@see ICPUImage) - inline core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams) - { - if (!IGPUImage::validateCreationParameters(creationParams)) - { - m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); - return nullptr; - } - // TODO: @Cyprian validation of creationParams against the device's limits (sample counts, etc.) 
see vkCreateImage - return createImage_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& params); + // Create an ImageView that can actually be used by shaders (@see ICPUImageView) inline core::smart_refctd_ptr createImageView(IGPUImageView::SCreationParams&& params) { @@ -765,9 +750,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual bool bindBufferMemory_impl(const uint32_t count, const SBindBufferMemoryInfo* pInfos) = 0; virtual bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) = 0; - virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) = 0; + virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) = 0; - virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) = 0; + virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 0b14590e83..5434591fb6 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -69,8 +69,6 @@ class ISemaphore : public IBackendObject //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE //! 
Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE void* externalHandle = nullptr; - - uint64_t initialValue = 0; }; auto const& getCreationParams() const @@ -85,7 +83,7 @@ class ISemaphore : public IBackendObject {} virtual ~ISemaphore() = default; - SCreationParams m_creationParams; + const SCreationParams m_creationParams; }; } diff --git a/include/nbl/video/SPhysicalDeviceLimits.h b/include/nbl/video/SPhysicalDeviceLimits.h index c3e13f145b..b639f37230 100644 --- a/include/nbl/video/SPhysicalDeviceLimits.h +++ b/include/nbl/video/SPhysicalDeviceLimits.h @@ -552,10 +552,6 @@ struct SPhysicalDeviceLimits /* CooperativeMatrixPropertiesKHR *//* VK_KHR_cooperative_matrix */ core::bitflag cooperativeMatrixSupportedStages = asset::IShader::ESS_UNKNOWN; - bool externalFenceWin32 = false; /* VK_KHR_external_fence_win32 */ // [TODO] requires instance extensions, add them - bool externalMemoryWin32 = false; /* VK_KHR_external_memory_win32 */ // [TODO] requires instance extensions, add them - bool externalSemaphoreWin32 = false; /* VK_KHR_external_semaphore_win32 */ // [TODO] requires instance extensions, add them - /* Always enabled if available, reported as limits */ // Core 1.0 Features diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 983c2ab277..d91fe09107 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -234,7 +234,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! WARNING: This function blocks CPU and stalls the GPU! 
inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) { - auto semaphore = m_device->createSemaphore(ISemaphore::SCreationParams{.initialValue=0}); + auto semaphore = m_device->createSemaphore(); // so we begin latching everything on the value of 1, but if we overflow it increases IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; From 4d174e530335fe3d7ac3a1e9c66dca3d466fcea5 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 20:10:19 +0300 Subject: [PATCH 57/62] last commit part 2 --- src/nbl/video/CCUDADevice.cpp | 2 +- src/nbl/video/CCUDASharedMemory.cpp | 46 ++--------- src/nbl/video/CVulkanImage.cpp | 1 - src/nbl/video/CVulkanLogicalDevice.cpp | 100 +++++++----------------- src/nbl/video/CVulkanLogicalDevice.h | 6 +- src/nbl/video/CVulkanPhysicalDevice.cpp | 4 +- src/nbl/video/IGPUCommandBuffer.cpp | 2 +- src/nbl/video/ILogicalDevice.cpp | 70 +++++++++++++++++ 8 files changed, 110 insertions(+), 121 deletions(-) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 39faaaa0ed..9fbb635f52 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -139,7 +139,7 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { IDeviceMemoryAllocator::SAllocateInfo info = { - .size = m_params.granularSize, - .externalHandleType = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - .externalHandle = m_params.osHandle, + { + .allocationSize = m_params.granularSize, + .externalHandleType = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + .externalHandle = m_params.osHandle, + } }; auto pd = device->getPhysicalDevice(); @@ -43,46 +45,12 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsMemor std::make_unique(core::smart_refctd_ptr(this))).memory; } -#if 0 -core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDevice* device, core::bitflag usage) 
const +core::smart_refctd_ptr CCUDASharedMemory::createAndBindImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const { if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) return nullptr; - auto buf = device->createBuffer({{ - .size = m_params.granularSize, - .usage = usage }, {{ - .postDestroyCleanup = std::make_unique(core::smart_refctd_ptr(this)), - .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - .externalHandle = m_params.osHandle - }}}); - - auto req = buf->getMemoryReqs(); - auto pd = device->getPhysicalDevice(); - switch (m_params.location) - { - case CU_MEM_LOCATION_TYPE_DEVICE: req.memoryTypeBits &= pd->getDeviceLocalMemoryTypeBits(); break; - case CU_MEM_LOCATION_TYPE_HOST: req.memoryTypeBits &= pd->getHostVisibleMemoryTypeBits(); break; - // TODO(Atil): Figure out how to handle these - case CU_MEM_LOCATION_TYPE_HOST_NUMA: - case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: - default: break; - } - - if (!device->allocate(req, buf.get()).isValid()) - return nullptr; - - return buf; -} - -#endif - -core::smart_refctd_ptr CCUDASharedMemory::createAndBindImage(ILogicalDevice* device, IGPUImage::SCreationParams&& params) const -{ - if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) - return nullptr; - - auto img = device->createImage(std::move(params)); + auto img = device->createImage({ std::move(params), { {.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE } }, IGPUImage::TILING::LINEAR }); if (exportAsMemory(device, img.get())) return img; diff --git a/src/nbl/video/CVulkanImage.cpp b/src/nbl/video/CVulkanImage.cpp index 748f21720b..ff5e2dfcb9 100644 --- a/src/nbl/video/CVulkanImage.cpp +++ b/src/nbl/video/CVulkanImage.cpp @@ -9,7 +9,6 @@ namespace nbl::video CVulkanImage::~CVulkanImage() { preDestroyStep(); - // don't destroy imported handles if (!m_cachedCreationParams.skipHandleDestroy) { const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); 
diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index a5b885849b..afedf60786 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -45,7 +45,7 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaphore::SCreationParams&& params) +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(uint64_t initialValue, ISemaphore::SCreationParams&& params) { VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; VkExportSemaphoreWin32HandleInfoKHR handleInfo = { .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, .dwAccess = GENERIC_ALL }; @@ -54,7 +54,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaph VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; type.pNext = params.externalHandleTypes.value ? &exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; - type.initialValue = params.initialValue; + type.initialValue = initialValue; VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, &type }; createInfo.flags = static_cast(0); // flags must be 0 @@ -150,16 +150,15 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) return ret; - const core::bitflag allocateFlags(info.flags); VkMemoryAllocateFlagsInfo vk_allocateFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, nullptr }; { - if (allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) + if 
(info.allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) vk_allocateFlagsInfo.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT; vk_allocateFlagsInfo.deviceMask = 0u; // unused: for now } VkMemoryDedicatedAllocateInfo vk_dedicatedInfo = {VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, nullptr}; VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo }; - vk_allocateInfo.allocationSize = info.size; + vk_allocateInfo.allocationSize = info.allocationSize; vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; VkImportMemoryWin32HandleInfoKHR importInfo = { @@ -168,13 +167,26 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca .handle = info.externalHandle }; + VkExportMemoryWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL, + }; + + VkExportMemoryAllocateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, + .pNext = &exportInfo, + .handleTypes = static_cast(info.externalHandleType), + }; + const void** pNext = &vk_allocateFlagsInfo.pNext; if (info.externalHandleType) { - // Importing - *pNext = &importInfo; - pNext = &importInfo.pNext; + if (info.externalHandle) //importing + *pNext = &importInfo; + else // exporting + *pNext = &exportInfo; + pNext = (const void**)&((VkBaseInStructure*)*pNext)->pNext; } if(info.dedication) @@ -207,15 +219,8 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; - CVulkanMemoryAllocation::SCreationParams params = { - .allocateFlags = allocateFlags, - .memoryPropertyFlags = memoryPropertyFlags, - .externalHandleType = info.externalHandleType, - .externalHandle = info.externalHandle, - .dedicated = 
!!info.dedication, - .allocationSize = info.size, - }; - + CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; + ret.memory = core::make_smart_refctd_ptr(this,vk_deviceMemory, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) @@ -334,7 +339,7 @@ bool CVulkanLogicalDevice::bindImageMemory_impl(const uint32_t count, const SBin } -core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) +core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) { VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkBufferDeviceAddressCreateInfoEXT, VkBufferOpaqueCaptureAddressCreateInfo, VkDedicatedAllocationBufferCreateInfoNV, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR @@ -344,9 +349,8 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB .handleTypes = creationParams.externalHandleTypes.value, }; - const bool external = creationParams.externalHandleTypes.value; - vk_createInfo.pNext = external ? &externalMemoryInfo : nullptr; + vk_createInfo.pNext = creationParams.externalHandleTypes.value ? 
&externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); vk_createInfo.usage = getVkBufferUsageFlagsFromBufferUsageFlags(creationParams.usage); @@ -354,26 +358,6 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB vk_createInfo.queueFamilyIndexCount = creationParams.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = creationParams.queueFamilyIndices; - bool dedicatedOnly = false; - - if (external) - { - core::bitflag requestedTypes = creationParams.externalHandleTypes; - - while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) - { - const auto handleType = static_cast(1u << (idx - 1)); - requestedTypes ^= handleType; - - auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); - - if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(creationParams.externalHandleTypes)) // incompatibility between requested types - return nullptr; - - // TODO: Handle this - dedicatedOnly = props.dedicatedOnly; - } - } VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) @@ -397,15 +381,13 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_im return nullptr; } -core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params) +core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) { VkExternalMemoryImageCreateInfo externalMemoryInfo = { .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, .handleTypes = params.externalHandleTypes.value, }; - - const bool external = params.externalHandleTypes.value; - + VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, &externalMemoryInfo }; vk_stencilUsage.stencilUsage = 
getVkImageUsageFlagsFromImageUsageFlags(params.actualStencilUsage().value, true); @@ -434,35 +416,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_createInfo.pQueueFamilyIndices = params.queueFamilyIndices; vk_createInfo.initialLayout = params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED; - bool dedicatedOnly = false; - if (external) - { - core::bitflag requestedTypes = params.externalHandleTypes; - auto pd = dynamic_cast(m_physicalDevice)->getInternalObject(); - while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) - { - const auto handleType = static_cast(1u << (idx - 1)); - requestedTypes ^= handleType; - - auto props = m_physicalDevice->getExternalImageProperties(params.format, params.tiling, params.type, params.usage, params.flags, handleType); - - if (props.maxArrayLayers < vk_createInfo.arrayLayers || - !core::bitflag(props.sampleCounts).hasFlags(params.samples) || - /* props.maxResourceSize?? */ - props.maxExtent.width < vk_createInfo.extent.width || - props.maxExtent.height < vk_createInfo.extent.height || - props.maxExtent.depth < vk_createInfo.extent.depth) - { - return nullptr; - } - - if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(params.externalHandleTypes)) // incompatibility between requested types - return nullptr; - - dedicatedOnly |= props.dedicatedOnly; - } - } - + VkImage vk_image; if (m_devf.vk.vkCreateImage(m_vkdev, &vk_createInfo, nullptr, &vk_image) != VK_SUCCESS) return nullptr; diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 0df38ffd67..f18fb3dad4 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -52,7 +52,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice return CVulkanQueue::getResultFrom(m_devf.vk.vkDeviceWaitIdle(m_vkdev)); } - core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) override; + core::smart_refctd_ptr 
createSemaphore(uint64_t initialValue, ISemaphore::SCreationParams &&) override; ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; @@ -103,9 +103,9 @@ class CVulkanLogicalDevice final : public ILogicalDevice bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) override; // descriptor creation - core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) override; + core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) override; core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) override; - core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) override; + core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) override; core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) override; VkAccelerationStructureKHR createAccelerationStructure(const IGPUAccelerationStructure::SCreationParams& params, const VkAccelerationStructureTypeKHR type, const VkAccelerationStructureMotionInfoNV* motionInfo=nullptr); inline core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) override diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index ecfdaa6f42..62dcde7d42 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1204,9 +1204,7 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart if (isExtensionSupported(VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME)) properties.limits.cooperativeMatrixRobustness = cooperativeMatrixFeatures.robustness; #endif - properties.limits.externalFenceWin32 = 
isExtensionSupported(VK_KHR_EXTERNAL_FENCE_WIN32_EXTENSION_NAME); - properties.limits.externalMemoryWin32 = isExtensionSupported(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); - properties.limits.externalSemaphoreWin32 = isExtensionSupported(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); + } // we compare all limits against the defaults easily! diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index b919e0333a..4eb12e30e3 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -305,7 +305,7 @@ bool IGPUCommandBuffer::waitEvents(const std::span events, const SEvent bool IGPUCommandBuffer::pipelineBarrier(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) { - if (!checkStateBeforeRecording(~queue_flags_t::NONE)) + if (!checkStateBeforeRecording(/*everything is allowed*/)) return false; if (depInfo.memBarriers.empty() && depInfo.bufBarriers.empty() && depInfo.imgBarriers.empty()) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 97030ccbba..5ac47d81d7 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -647,4 +647,74 @@ bool ILogicalDevice::createGraphicsPipelines( if (!output[i]) return false; return true; +} + +core::smart_refctd_ptr ILogicalDevice::createBuffer(IGPUBuffer::SCreationParams&& creationParams) +{ + const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; + if (creationParams.size > maxSize) + { + m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit!", system::ILogger::ELL_ERROR, creationParams.size, this, maxSize); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= 
handleType; + + auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(creationParams.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + dedicatedOnly |= props.dedicatedOnly; + } + } + return createBuffer_impl(std::move(creationParams), dedicatedOnly); +} + +core::smart_refctd_ptr ILogicalDevice::createImage(IGPUImage::SCreationParams&& params) +{ + if (!IGPUImage::validateCreationParameters(params)) + { + m_logger.log("Failed to create Image, invalid creation parameters!", system::ILogger::ELL_ERROR); + return nullptr; + } + + const bool external = params.externalHandleTypes.value; + bool dedicatedOnly = false; + if (external) + { + core::bitflag requestedTypes = params.externalHandleTypes; + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalImageProperties(params.format, params.tiling, params.type, params.usage, params.flags, handleType); + + if (props.maxArrayLayers < params.arrayLayers || + !core::bitflag(props.sampleCounts).hasFlags(params.samples) || + /* props.maxResourceSize?? */ + props.maxExtent.width < params.extent.width || + props.maxExtent.height < params.extent.height || + props.maxExtent.depth < params.extent.depth) + { + return nullptr; + } + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(params.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + dedicatedOnly |= props.dedicatedOnly; + } + } + // TODO: @Cyprian validation of creationParams against the device's limits (sample counts, etc.) 
see vkCreateImage + return createImage_impl(std::move(params), dedicatedOnly); } \ No newline at end of file From cbd18f482ce0a32c513f4e5ee95281b7a157b0a2 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 23:28:02 +0300 Subject: [PATCH 58/62] add missing cuda fn & map queue indices to vk --- include/nbl/video/CCUDAHandler.h | 1 + src/nbl/video/CVulkanCommandBuffer.cpp | 28 ++++++++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index dbad47877d..44b6766e40 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -136,6 +136,7 @@ class CCUDAHandler : public core::IReferenceCounted ,cuDestroyExternalSemaphore ,cuImportExternalSemaphore ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index 2b1f9d9070..64ec5f68c0 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -48,25 +48,41 @@ void fill(vk_barrier_t& out, const ResourceBarrier& in, uint32_t selfQueueFamily // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkBufferMemoryBarrier2-buffer-04088 if (concurrentSharing) selfQueueFamilyIndex = IQueue::FamilyIgnored; + + auto mapQFIdx = [](uint32_t idx) + { + switch (idx) + { + case IQueue::FamilyExternal: + case IQueue::FamilyIgnored: + case IQueue::FamilyForeign: + idx |= 1u << 31; + break; + } + return idx; + }; + if constexpr (!std::is_same_v) { - out.srcQueueFamilyIndex = selfQueueFamilyIndex; - out.dstQueueFamilyIndex = selfQueueFamilyIndex; + out.srcQueueFamilyIndex = mapQFIdx(selfQueueFamilyIndex); + out.dstQueueFamilyIndex = mapQFIdx(selfQueueFamilyIndex); } const asset::SMemoryBarrier* memoryBarrier; if constexpr (std::is_same_v) { memoryBarrier = &in.dep; // 
in.otherQueueFamilyIndex==selfQueueFamilyIndex not resulting in ownership transfer is implicit - if (!concurrentSharing && in.otherQueueFamilyIndex!=IQueue::FamilyIgnored) - switch (in.ownershipOp) + if (!concurrentSharing && in.otherQueueFamilyIndex != IQueue::FamilyIgnored) { + switch (in.ownershipOp) + { case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE: - out.dstQueueFamilyIndex = in.otherQueueFamilyIndex; + out.dstQueueFamilyIndex = mapQFIdx(in.otherQueueFamilyIndex); break; case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE: - out.srcQueueFamilyIndex = in.otherQueueFamilyIndex; + out.srcQueueFamilyIndex = mapQFIdx(in.otherQueueFamilyIndex); break; + } } } else From 23fe8d4518452028c36f2343886b51c9d08ff6c1 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 23:28:13 +0300 Subject: [PATCH 59/62] update submodule --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 8cd78a71f4..744dd44c3b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8cd78a71f4a03c7ace9df2ee9b9df07317779909 +Subproject commit 744dd44c3bd6d5bb5734402b85f49fd0e27a46cc From c32fd793f028f3c8974828f5df605e281bb0ce1d Mon Sep 17 00:00:00 2001 From: atkurtul Date: Fri, 19 Jan 2024 01:12:01 +0300 Subject: [PATCH 60/62] cache cuda devices --- examples_tests | 2 +- include/nbl/video/CCUDAHandler.h | 28 +++-- src/nbl/video/CCUDAHandler.cpp | 207 +++++++++++++++++-------------- src/nbl/video/ILogicalDevice.cpp | 21 ++-- 4 files changed, 147 insertions(+), 111 deletions(-) diff --git a/examples_tests b/examples_tests index 744dd44c3b..9897e115e7 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 744dd44c3bd6d5bb5734402b85f49fd0e27a46cc +Subproject commit 9897e115e726052662596ba6915c5438ebd51030 diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 44b6766e40..022024e856 100644 --- 
a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -182,6 +182,18 @@ class CCUDAHandler : public core::IReferenceCounted return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } + struct SCUDADeviceInfo + { + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + }; + + inline core::vector const& getAvailableDevices() const + { + return m_availableDevices; + } + // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -217,6 +229,7 @@ class CCUDAHandler : public core::IReferenceCounted result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); } + inline ptx_and_nvrtcResult_t compileDirectlyToPTX( const char* source, const char* filename, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, @@ -225,6 +238,7 @@ class CCUDAHandler : public core::IReferenceCounted { return compileDirectlyToPTX(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); } + inline ptx_and_nvrtcResult_t compileDirectlyToPTX( system::IFile* file, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, @@ -245,16 +259,8 @@ class CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) - { - for (auto& header : m_headers) - { - 
m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); - m_headerNamesStorage.push_back(header->getFileName().string()); - m_headerNames.push_back(m_headerNamesStorage.back().c_str()); - } - } + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + ~CCUDAHandler() = default; @@ -284,6 +290,8 @@ class CCUDAHandler : public core::IReferenceCounted core::vector m_headerNames; system::logger_opt_smart_ptr m_logger; int m_version; + + core::vector m_availableDevices; }; } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 09c2fbe14e..2789bed2a6 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -11,7 +11,49 @@ namespace nbl::video { - + +CCUDAHandler::CCUDAHandler( + CUDA&& _cuda, + NVRTC&& _nvrtc, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_cuda(std::move(_cuda)) + , m_nvrtc(std::move(_nvrtc)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + , m_version(_version) +{ + for (auto& header : m_headers) + { + m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); + m_headerNamesStorage.push_back(header->getFileName().string()); + m_headerNames.push_back(m_headerNamesStorage.back().c_str()); + } + + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + return; + + for (int ordinal = 0; ordinal < deviceCount; ordinal++) + { + CUdevice handle = -1; + if (m_cuda.pcuDeviceGet(&handle, ordinal) != CUDA_SUCCESS || handle < 0) + continue; + + CUuuid uuid = {}; + if (m_cuda.pcuDeviceGetUuid(&uuid, handle) != CUDA_SUCCESS) + continue; + + m_availableDevices.emplace_back(handle, uuid); + + int* attributes = m_availableDevices.back().attributes; + for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) + m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + + } +} + bool 
CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) @@ -527,110 +569,95 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) - return nullptr; - - for (int ordinal=0; ordinalgetProperties().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.uuid, &physicalDevice->getProperties().deviceUUID, VK_UUID_SIZE)) { - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; - for (int i=0; i(i),handle); - CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; - const int& archMajor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; - const int& archMinor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; + const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; + const int& archMinor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; switch (archMajor) { - case 3: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_30; - break; - case 2: - arch = CCUDADevice::EVA_32; - break; - case 5: - arch = CCUDADevice::EVA_35; - break; - case 7: - arch = CCUDADevice::EVA_37; - break; - default: - break; - } + case 3: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_30; break; - case 5: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_50; - break; - case 2: - arch = CCUDADevice::EVA_52; - break; - case 3: - arch = CCUDADevice::EVA_53; - break; - default: - break; - } + case 2: + arch = CCUDADevice::EVA_32; break; - case 6: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_60; - break; - case 1: - arch = CCUDADevice::EVA_61; - break; - case 2: - arch = CCUDADevice::EVA_62; - break; - default: - break; - } + case 5: + arch = CCUDADevice::EVA_35; break; case 7: - switch (archMinor) - { - case 0: - arch = 
CCUDADevice::EVA_70; - break; - case 2: - arch = CCUDADevice::EVA_72; - break; - case 5: - arch = CCUDADevice::EVA_75; - break; - default: - break; - } + arch = CCUDADevice::EVA_37; + break; + default: + break; + } + break; + case 5: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_50; + break; + case 2: + arch = CCUDADevice::EVA_52; + break; + case 3: + arch = CCUDADevice::EVA_53; break; default: - if (archMajor>=8) - arch = CCUDADevice::EVA_80; break; + } + break; + case 6: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_60; + break; + case 1: + arch = CCUDADevice::EVA_61; + break; + case 2: + arch = CCUDADevice::EVA_62; + break; + default: + break; + } + break; + case 7: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_70; + break; + case 2: + arch = CCUDADevice::EVA_72; + break; + case 5: + arch = CCUDADevice::EVA_75; + break; + default: + break; + } + break; + default: + if (archMajor >= 8) + arch = CCUDADevice::EVA_80; + break; } - if (arch==CCUDADevice::EVA_COUNT) + if (arch == CCUDADevice::EVA_COUNT) continue; - auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,handle,core::smart_refctd_ptr(this)); - return core::smart_refctd_ptr(device,core::dont_grab); - } - } + return core::smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)), core::dont_grab); + } + } + return nullptr; } diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 5ac47d81d7..69460619fe 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -111,7 +111,7 @@ bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag return getSupportedStageMask(queueFamilyIndex).hasFlags(stageMask); } -bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag stageMask) const +bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag 
accesMask) const { if (queueFamilyIndex>m_queueFamilyInfos->size()) return false; @@ -119,15 +119,16 @@ bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag const auto& familyProps = m_physicalDevice->getQueueFamilyProperties()[queueFamilyIndex].queueFlags; const bool shaderCapableFamily = bool(familyProps&(q_family_flags_t::COMPUTE_BIT|q_family_flags_t::GRAPHICS_BIT)); // strip special values - if (stageMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_READ_BITS)) - stageMask ^= asset::ACCESS_FLAGS::MEMORY_READ_BITS; - else if (stageMask.hasFlags(asset::ACCESS_FLAGS::SHADER_READ_BITS) && shaderCapableFamily) - stageMask ^= asset::ACCESS_FLAGS::SHADER_READ_BITS; - if (stageMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_WRITE_BITS)) - stageMask ^= asset::ACCESS_FLAGS::MEMORY_WRITE_BITS; - else if (stageMask.hasFlags(asset::ACCESS_FLAGS::SHADER_WRITE_BITS) && shaderCapableFamily) - stageMask ^= asset::ACCESS_FLAGS::SHADER_WRITE_BITS; - return getSupportedAccessMask(queueFamilyIndex).hasFlags(stageMask); + VK_ACCESS_SHADER_WRITE_BIT; + if (accesMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_READ_BITS)) + accesMask ^= asset::ACCESS_FLAGS::MEMORY_READ_BITS; + else if (accesMask.hasFlags(asset::ACCESS_FLAGS::SHADER_READ_BITS) && shaderCapableFamily) + accesMask ^= asset::ACCESS_FLAGS::SHADER_READ_BITS; + if (accesMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_WRITE_BITS)) + accesMask ^= asset::ACCESS_FLAGS::MEMORY_WRITE_BITS; + else if (accesMask.hasFlags(asset::ACCESS_FLAGS::SHADER_WRITE_BITS) && shaderCapableFamily) + accesMask ^= asset::ACCESS_FLAGS::SHADER_WRITE_BITS; + return getSupportedAccessMask(queueFamilyIndex).hasFlags(accesMask); } bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyIndex, asset::SMemoryBarrier barrier) const From 4e2185c1e3197aada9095c0ca32c46bcc01a1dda Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 20 Jan 2024 01:25:13 +0300 Subject: [PATCH 61/62] ifdef platform code --- examples_tests | 2 +- 
include/nbl/video/EApiType.h | 9 ++ include/nbl/video/IDeviceMemoryAllocation.h | 5 +- include/nbl/video/IDeviceMemoryBacked.h | 2 - include/nbl/video/IPhysicalDevice.h | 1 - include/nbl/video/ISemaphore.h | 2 +- src/nbl/video/CCUDASharedMemory.cpp | 8 -- src/nbl/video/CVulkanImage.cpp | 2 + src/nbl/video/CVulkanLogicalDevice.cpp | 152 +++++++++++++++++--- src/nbl/video/CVulkanMemoryAllocation.cpp | 6 + src/nbl/video/ILogicalDevice.cpp | 1 - 11 files changed, 151 insertions(+), 39 deletions(-) diff --git a/examples_tests b/examples_tests index 9897e115e7..73f147941e 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 9897e115e726052662596ba6915c5438ebd51030 +Subproject commit 73f147941ef5362d0adee47ae72b4088b8c49aa5 diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index e670dc90d8..275e3f0a7a 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -13,6 +13,15 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; + +using ExternalHandleType = +#ifdef _WIN32 +void* +#else +int +#endif +; + } #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 9ca663b9ea..d162a029be 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -172,7 +172,7 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE //! 
Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE - void* externalHandle = nullptr; + ExternalHandleType externalHandle = 0; }; struct SCreationParams: SInfo @@ -180,6 +180,8 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; const bool dedicated = false; }; + + inline const SCreationParams& getCreationParams() const { return m_params; } protected: inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) @@ -199,7 +201,6 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted virtual void* map_impl(const MemoryRange& range, const core::bitflag accessHint) = 0; virtual bool unmap_impl() = 0; - const ILogicalDevice* m_originDevice = nullptr; SCreationParams m_params = {}; uint8_t* m_mappedPtr = nullptr; diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index 0071a53d71..c5c28ad717 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -123,11 +123,9 @@ class IDeviceMemoryBacked : public IBackendObject m_cachedCreationParams.preDestroyCleanup = nullptr; } - //! 
members SCachedCreationParams m_cachedCreationParams; const SDeviceMemoryRequirements m_cachedMemoryReqs; - void* m_cachedExternalHandle = nullptr; }; } // end namespace nbl::video diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index e32a65d9f3..870a435f5e 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -28,7 +28,6 @@ namespace nbl::video - class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable { template static constexpr bool is_bitflag = false; diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 5434591fb6..07506067af 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -68,7 +68,7 @@ class ISemaphore : public IBackendObject core::bitflag externalHandleTypes = EHT_NONE; //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE //! Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE - void* externalHandle = nullptr; + ExternalHandleType externalHandle = nullptr; }; auto const& getCreationParams() const diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 6510967271..a5b8011920 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -10,14 +10,6 @@ namespace nbl::video core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { - IDeviceMemoryAllocator::SAllocateInfo info = { - { - .allocationSize = m_params.granularSize, - .externalHandleType = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - .externalHandle = m_params.osHandle, - } - }; - auto pd = device->getPhysicalDevice(); uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); diff --git a/src/nbl/video/CVulkanImage.cpp 
b/src/nbl/video/CVulkanImage.cpp index ff5e2dfcb9..72e9dc62fc 100644 --- a/src/nbl/video/CVulkanImage.cpp +++ b/src/nbl/video/CVulkanImage.cpp @@ -9,6 +9,8 @@ namespace nbl::video CVulkanImage::~CVulkanImage() { preDestroyStep(); + // e.g. don't destroy imported handles from the same VkInstance (e.g. if hooking into external Vulkan codebase) + // truly EXTERNAL_MEMORY imported handles, do need to be destroyed + CloseHandled (separate thing) if (!m_cachedCreationParams.skipHandleDestroy) { const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index afedf60786..607aa69caa 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -47,12 +47,38 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(uint64_t initialValue, ISemaphore::SCreationParams&& params) { - VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; - VkExportSemaphoreWin32HandleInfoKHR handleInfo = { .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, .dwAccess = GENERIC_ALL }; - VkExportSemaphoreCreateInfo exportInfo = { VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, &handleInfo, static_cast(params.externalHandleTypes.value) }; +#ifdef _WIN32 + VkImportSemaphoreWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(params.externalHandleTypes.value), + .handle = params.externalHandle, + }; + VkExportSemaphoreWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL + }; +#else + VkImportSemaphoreFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR, + .handleType = static_cast(params.externalHandleTypes.value), + .fd = params.externalHandle, + }; +#endif 
+ + VkExportSemaphoreCreateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif + .handleTypes = static_cast(params.externalHandleTypes.value) + }; + + + const bool importing = params.externalHandleTypes.value && params.externalHandle; + const bool exporting = params.externalHandleTypes.value && !params.externalHandle; VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; - type.pNext = params.externalHandleTypes.value ? &exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR + type.pNext = exporting ? &exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; type.initialValue = initialValue; @@ -63,18 +89,27 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(uint64_ if (VK_SUCCESS != m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore)) return nullptr; - if (params.externalHandleTypes.value) + VkSemaphoreGetWin32HandleInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = semaphore, + .handleType = static_cast(params.externalHandleTypes.value), + }; + +#ifdef _WIN32 + auto importfn = m_devf.vk.vkImportSemaphoreWin32HandleKHR; + auto exportfn = m_devf.vk.vkGetSemaphoreWin32HandleKHR; +#else + auto importfn = m_devf.vk.vkImportSemaphoreFdKHR; + auto exportfn = m_devf.vk.vkGetSemaphoreFdKHR; +#endif + + if ( + (importing && (VK_SUCCESS != importfn(m_vkdev, &importInfo))) || + (exporting && (VK_SUCCESS != exportfn(m_vkdev, &props, ¶ms.externalHandle))) + ) { - VkSemaphoreGetWin32HandleInfoKHR props = { - .sType 
= VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, - .semaphore = semaphore, - .handleType = static_cast(params.externalHandleTypes.value), - }; - if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, ¶ms.externalHandle)) - { - m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); - return nullptr; - } + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); + return nullptr; } return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), semaphore, std::move(params)); @@ -143,12 +178,28 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDeferredO return core::smart_refctd_ptr(reinterpret_cast(memory),core::dont_grab); } +void* DupeHandle(uint64_t pid, void* handle) +{ +#ifdef _WIN32 + DWORD flags; + HANDLE re = 0; + + HANDLE cur = GetCurrentProcess(); + HANDLE src = pid ? OpenProcess(GENERIC_ALL, false, pid) : cur; + + if (!DuplicateHandle(src, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return 0; + + CloseHandle(src); + return re; +#endif + return handle; +} IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAllocateInfo& info) { - IDeviceMemoryAllocator::SAllocation ret = {}; if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) - return ret; + return {}; VkMemoryAllocateFlagsInfo vk_allocateFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, nullptr }; { @@ -161,6 +212,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca vk_allocateInfo.allocationSize = info.allocationSize; vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; +#ifdef _WIN32 VkImportMemoryWin32HandleInfoKHR importInfo = { .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, .handleType = static_cast(info.externalHandleType), @@ -171,10 +223,19 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR, .dwAccess = GENERIC_ALL, }; +#else + 
VkImportMemoryFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .fd = (int)info.externalHandle, + }; +#endif VkExportMemoryAllocateInfo exportInfo = { .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, - .pNext = &exportInfo, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif .handleTypes = static_cast(info.externalHandleType), }; @@ -183,7 +244,11 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca if (info.externalHandleType) { if (info.externalHandle) //importing + { + auto duped = DupeHandle(0, info.externalHandle); + const_cast(info.externalHandle) = duped; *pNext = &importInfo; + } else // exporting *pNext = &exportInfo; pNext = (const void**)&((VkBaseInStructure*)*pNext)->pNext; @@ -206,7 +271,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca break; default: assert(false); - return ret; + return {}; break; } } @@ -214,15 +279,57 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca VkDeviceMemory vk_deviceMemory; auto vk_res = m_devf.vk.vkAllocateMemory(m_vkdev, &vk_allocateInfo, nullptr, &vk_deviceMemory); if (vk_res!=VK_SUCCESS) - return ret; + return {}; + + const bool exported = info.externalHandleType && !info.externalHandle; + + if (exported) + { +#ifdef _WIN32 + VkMemoryGetWin32HandleInfoKHR +#else + VkMemoryGetFdInfoKHR +#endif + handleInfo = { .sType = +#ifdef _WIN32 + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#else + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#endif + .memory = vk_deviceMemory, + .handleType = static_cast(info.externalHandleType), + }; + + /* + For handle types defined as NT handles, + the handles returned by vkGetMemoryWin32HandleKHR are owned by the application + and hold a reference to their payload. 
To avoid leaking resources, + the application must release ownership of them + using the CloseHandle system call when they are no longer needed. + */ + + if (VK_SUCCESS != m_devf.vk. +#ifdef _WIN32 + vkGetMemoryWin32HandleKHR +#else + vkGetMemoryFdKHR +#endif + (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) + { + m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); + return {}; + } + + } // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; - - ret.memory = core::make_smart_refctd_ptr(this,vk_deviceMemory, std::move(params)); + IDeviceMemoryAllocator::SAllocation ret = {}; + ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator + if(info.dedication) { bool dedicationSuccess = false; @@ -349,7 +456,6 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB .handleTypes = creationParams.externalHandleTypes.value, }; - vk_createInfo.pNext = creationParams.externalHandleTypes.value ? 
&externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index fb214c897e..7597e33717 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -16,6 +16,12 @@ CVulkanMemoryAllocation::CVulkanMemoryAllocation( CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { + if (m_params.externalHandle) + { + bool re = CloseHandle(getCreationParams().externalHandle); + assert(re); + } + m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); } diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 69460619fe..2902ff7509 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -119,7 +119,6 @@ bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag const auto& familyProps = m_physicalDevice->getQueueFamilyProperties()[queueFamilyIndex].queueFlags; const bool shaderCapableFamily = bool(familyProps&(q_family_flags_t::COMPUTE_BIT|q_family_flags_t::GRAPHICS_BIT)); // strip special values - VK_ACCESS_SHADER_WRITE_BIT; if (accesMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_READ_BITS)) accesMask ^= asset::ACCESS_FLAGS::MEMORY_READ_BITS; else if (accesMask.hasFlags(asset::ACCESS_FLAGS::SHADER_READ_BITS) && shaderCapableFamily) From bd0b76a341d86034474cbf67c266a9b25e80e81d Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 20 Jan 2024 01:31:58 +0300 Subject: [PATCH 62/62] log queue validation warning --- include/nbl/video/IQueue.h | 19 +++++++++++++------ src/nbl/video/IQueue.cpp | 9 ++++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index 654d95a847..4000fcd2a3 100644 --- a/include/nbl/video/IQueue.h +++ 
b/include/nbl/video/IQueue.h @@ -68,7 +68,7 @@ class IQueue : public core::Interface, public core::Unmovable { SUCCESS, DEVICE_LOST, - OTHER_ERROR + OTHER_ERROR, }; // struct SSubmitInfo @@ -92,16 +92,23 @@ class IQueue : public core::Interface, public core::Unmovable std::span commandBuffers = {}; std::span signalSemaphores = {}; - inline bool valid() const + enum Validity + { + INVALID, + VALID, + WORK_WITHOUT_SYNC, + }; + + inline Validity valid() const { // any two being empty is wrong if (commandBuffers.empty() && signalSemaphores.empty()) // wait and do nothing - return false; + return INVALID; if (waitSemaphores.empty() && signalSemaphores.empty()) // work without sync - return false; + return WORK_WITHOUT_SYNC; if (waitSemaphores.empty() && commandBuffers.empty()) // signal without doing work first - return false; - return true; + return INVALID; + return VALID; } }; virtual RESULT submit(const std::span _submits); diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index e75e7b2cad..2527562bac 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -13,8 +13,15 @@ auto IQueue::submit(const std::span _submits) -> RESULT auto* logger = m_originDevice->getPhysicalDevice()->getDebugCallback()->getLogger(); for (const auto& submit : _submits) { - if (!submit.valid()) + switch (submit.valid()) + { + case SSubmitInfo::INVALID: return RESULT::OTHER_ERROR; + case SSubmitInfo::WORK_WITHOUT_SYNC: + logger->log("Work withouth sync!", system::ILogger::ELL_WARNING); + default: + break; + } auto invalidSemaphores = [this,logger](const std::span semaphoreInfos) -> bool {