diff --git a/3rdparty/jitify b/3rdparty/jitify
index 0d6dbd8ccd..1a0ca0e837 160000
--- a/3rdparty/jitify
+++ b/3rdparty/jitify
@@ -1 +1 @@
-Subproject commit 0d6dbd8ccd07e6bfc811d363a54912dfc6d4799a
+Subproject commit 1a0ca0e837405506f3b8f7883bacb71c20d86d96
diff --git a/examples_tests b/examples_tests
index 9e980e729b..73f147941e 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 9e980e729bb6e2813d0b2a60e20c182b837d4fce
+Subproject commit 73f147941ef5362d0adee47ae72b4088b8c49aa5
diff --git a/include/nbl/asset/IBuffer.h b/include/nbl/asset/IBuffer.h
index e11d8faf7d..d50a415e69 100644
--- a/include/nbl/asset/IBuffer.h
+++ b/include/nbl/asset/IBuffer.h
@@ -42,6 +42,8 @@ class IBuffer : public core::IBuffer, public IDescriptor
 		//! synthetic Nabla inventions
 		// whether `IGPUCommandBuffer::updateBuffer` can be used on this buffer
 		EUF_INLINE_UPDATE_VIA_CMDBUF = 0x80000000u,
+
+		EUF_SYNTHEHIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if any more synthetic flags are added */
 	};
 
 	//!
diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h
index 1120224fdb..551c2a7e5b 100644
--- a/include/nbl/video/CCUDADevice.h
+++ b/include/nbl/video/CCUDADevice.h
@@ -6,7 +6,8 @@
 
 #include "nbl/video/IPhysicalDevice.h"
 
-
+#include "nbl/video/CCUDASharedMemory.h"
+#include "nbl/video/CCUDASharedSemaphore.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 
@@ -23,10 +24,27 @@ namespace nbl::video
 {
 
 class CCUDAHandler;
+class CCUDASharedMemory;
+class CCUDASharedSemaphore;
 
 class CCUDADevice : public core::IReferenceCounted
 {
 	public:
+#ifdef _WIN32
+		static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32;
+		static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32;
+#else
+		static constexpr IDeviceMemoryBacked::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryBacked::EHT_OPAQUE_FD;
+		static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+#endif
+		struct SCUDACleaner : video::ICleanup
+		{
+			core::smart_refctd_ptr resource;
+			SCUDACleaner(core::smart_refctd_ptr resource)
+				: resource(std::move(resource))
+			{ }
+		};
+
 		enum E_VIRTUAL_ARCHITECTURE
 		{
 			EVA_30,
@@ -72,127 +90,37 @@ class CCUDADevice : public core::IReferenceCounted
 		// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vulkan-interoperability
 		// Watch out, use Driver API (`cu` functions) NOT the Runtime API (`cuda` functions)
 		// Also maybe separate this out into its own `CCUDA` class instead of nesting it here?
-#if 0 - template - struct GraphicsAPIObjLink - { - GraphicsAPIObjLink() : obj(nullptr), cudaHandle(nullptr), acquired(false) - { - asImage = {nullptr}; - } - GraphicsAPIObjLink(core::smart_refctd_ptr&& _obj) : GraphicsAPIObjLink() - { - obj = std::move(_obj); - } - GraphicsAPIObjLink(GraphicsAPIObjLink&& other) : GraphicsAPIObjLink() - { - operator=(std::move(other)); - } - - GraphicsAPIObjLink(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(GraphicsAPIObjLink&& other) - { - std::swap(obj,other.obj); - std::swap(cudaHandle,other.cudaHandle); - std::swap(acquired,other.acquired); - std::swap(asImage,other.asImage); - return *this; - } - - ~GraphicsAPIObjLink() - { - assert(!acquired); // you've fucked up, there's no way for us to fix it, you need to release the objects on a proper stream - if (obj) - CCUDAHandler::cuda.pcuGraphicsUnregisterResource(cudaHandle); - } - - // - auto* getObject() const {return obj.get();} - - private: - core::smart_refctd_ptr obj; - CUgraphicsResource cudaHandle; - bool acquired; - - friend class CCUDAHandler; - public: - union - { - struct - { - CUdeviceptr pointer; - } asBuffer; - struct - { - CUmipmappedArray mipmappedArray; - CUarray array; - } asImage; - }; - }; - // - static CUresult registerBuffer(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - static CUresult registerImage(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); + CUdevice getInternalObject() const { return m_handle; } + const CCUDAHandler* getHandler() const { return m_handler.get(); } + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + CUresult createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); + bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } + size_t roundToGranularity(CUmemLocationType location, size_t size) const; - template - static CUresult acquireResourcesFromGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsMapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = true; - return retval; - } - template - static CUresult releaseResourcesToGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsUnmapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = false; - return retval; - } + protected: + CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); - static CUresult acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, 
GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes = nullptr); - static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); - static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); -#endif - protected: + // CUDAHandler creates CUDADevice, it needs to access ctor friend class CCUDAHandler; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); - ~CCUDADevice() = default; + + CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _handle, core::smart_refctd_ptr&& _handler); + ~CCUDADevice(); std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; IPhysicalDevice* const m_vulkanDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + core::smart_refctd_ptr m_handler; + CUdevice m_handle; + CUcontext m_context; + size_t m_allocationGranularity[4]; }; } #endif // _NBL_COMPILE_WITH_CUDA_ -#endif +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 01774b25d2..022024e856 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -34,7 +34,7 @@ class CCUDAHandler : public core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } // - core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; @@ -119,6 +119,24 @@ class CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate ,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -157,13 +175,25 @@ class CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::future bytesRead; + system::IFile::success_t bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.get()); + source.resize(bytesRead.getBytesProcessed()); return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } + struct SCUDADeviceInfo + { + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + }; + + inline core::vector const& getAvailableDevices() const + { + return m_availableDevices; + } + // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -199,6 +229,7 @@ class CCUDAHandler : public core::IReferenceCounted result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); } + inline ptx_and_nvrtcResult_t compileDirectlyToPTX( 
const char* source, const char* filename, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, @@ -207,6 +238,7 @@ class CCUDAHandler : public core::IReferenceCounted { return compileDirectlyToPTX(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); } + inline ptx_and_nvrtcResult_t compileDirectlyToPTX( system::IFile* file, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, @@ -226,20 +258,12 @@ class CCUDAHandler : public core::IReferenceCounted } core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); +protected: + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) - { - for (auto& header : m_headers) - { - m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); - m_headerNamesStorage.push_back(header->getFileName().string()); - m_headerNames.push_back(m_headerNamesStorage.back().c_str()); - } - } ~CCUDAHandler() = default; - + + // inline ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) { @@ -266,10 +290,12 @@ class CCUDAHandler : public core::IReferenceCounted core::vector m_headerNames; system::logger_opt_smart_ptr m_logger; int m_version; + + core::vector m_availableDevices; }; } #endif // _NBL_COMPILE_WITH_CUDA_ -#endif +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h new file mode 100644 index 0000000000..d900087d06 --- /dev/null +++ b/include/nbl/video/CCUDASharedMemory.h @@ -0,0 +1,71 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ + + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class CCUDASharedMemory : public core::IReferenceCounted +{ +public: + // required for us to see the move ctor + friend class CCUDADevice; + + CUdeviceptr getDeviceptr() const { return m_params.ptr; } + + struct SCreationParams + { + size_t size; + uint32_t alignment; + CUmemLocationType location; + }; + + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + CUdeviceptr ptr; + union + { + void* osHandle; + int fd; + }; + }; + + const SCreationParams& getCreationParams() const { return m_params; } + + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + core::smart_refctd_ptr createAndBindImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; + +protected: + + CCUDASharedMemory(core::smart_refctd_ptr&& device, SCachedCreationParams&& params) + : m_device(std::move(device)) + , m_params(std::move(params)) + {} + ~CCUDASharedMemory() override; + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h new file mode 100644 index 0000000000..882e794bd4 --- /dev/null +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif
+
+// useful includes in the future
+//#include "cudaEGL.h"
+//#include "cudaVDPAU.h"
+
+namespace nbl::video
+{
+
+class CCUDASharedSemaphore : public core::IReferenceCounted
+{
+public:
+	friend class CCUDADevice;
+
+	CUexternalSemaphore getInternalObject() const { return m_handle; }
+
+protected:
+
+	CCUDASharedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, CUexternalSemaphore semaphore, void* osHandle)
+		: m_device(std::move(device))
+		, m_src(std::move(src))
+		, m_handle(semaphore)
+		, m_osHandle(osHandle)
+	{}
+	~CCUDASharedSemaphore() override;
+
+	core::smart_refctd_ptr m_device;
+	core::smart_refctd_ptr m_src;
+	CUexternalSemaphore m_handle;
+	void* m_osHandle;
+};
+
+}
+
+#endif // _NBL_COMPILE_WITH_CUDA_
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h
index c996000e04..2505de6865 100644
--- a/include/nbl/video/CVulkanDeviceMemoryBacked.h
+++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h
@@ -35,11 +35,11 @@ class CVulkanDeviceMemoryBacked : public Interface
 	protected:
 		// special constructor for when memory requirements are known up-front (so far only swapchains and internal forwarding here)
 		CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const IDeviceMemoryBacked::SDeviceMemoryRequirements& _memReqs, const VkResource_t vkHandle);
-		CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const VkResource_t vkHandle) :
-			CVulkanDeviceMemoryBacked(dev,std::move(_creationParams),obtainRequirements(dev,vkHandle),vkHandle) {}
+		CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, bool dedicatedOnly, const VkResource_t vkHandle) :
+			CVulkanDeviceMemoryBacked(dev,std::move(_creationParams), obtainRequirements(dev, dedicatedOnly, vkHandle),vkHandle) {}
 
 	private:
-		static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle);
+		static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle);
 
 		core::smart_refctd_ptr m_memory = nullptr;
 		size_t m_offset = 0u;
diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h
index e670dc90d8..275e3f0a7a 100644
--- a/include/nbl/video/EApiType.h
+++ b/include/nbl/video/EApiType.h
@@ -13,6 +13,15 @@ enum E_API_TYPE : uint32_t
 	//EAT_WEBGPU
 };
 
+
+using ExternalHandleType =
+#ifdef _WIN32
+void*
+#else
+int
+#endif
+;
+
 }
 
 #endif
diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h
index 673f1834e6..d162a029be 100644
--- a/include/nbl/video/IDeviceMemoryAllocation.h
+++ b/include/nbl/video/IDeviceMemoryAllocation.h
@@ -24,6 +24,8 @@ We only support persistently mapped buffers with ARB_buffer_storage. Please don't ask us to support Buffer Orphaning. */
 class IDeviceMemoryAllocation : public virtual core::IReferenceCounted
 {
+	friend class IDeviceMemoryAllocator;
+	friend class ILogicalDevice;
 	public:
 		//! Access flags for how the application plans to use mapped memory (if any)
 		/** When you create the memory you can allow for it to be mapped (be given a pointer)
@@ -68,6 +70,19 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted
 			EMHF_MULTI_INSTANCE_BIT = 0x00000002,
 		};
 
+		//!
Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D11_TEXTURE = 0x00000008, + EHT_D3D11_TEXTURE_KMT = 0x00000010, + EHT_D3D12_HEAP = 0x00000020, + EHT_D3D12_RESOURCE = 0x00000040, + EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, + }; + // const ILogicalDevice* getOriginDevice() const {return m_originDevice;} @@ -75,25 +90,25 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted E_API_TYPE getAPIType() const; //! Whether the allocation was made for a specific resource and is supposed to only be bound to that resource. - inline bool isDedicated() const {return m_dedicated;} + inline bool isDedicated() const {return m_params.dedicated;} //! Returns the size of the memory allocation - inline size_t getAllocationSize() const {return m_allocationSize;} + inline size_t getAllocationSize() const {return m_params.allocationSize;} //! - inline core::bitflag getAllocateFlags() const { return m_allocateFlags; } + inline core::bitflag getAllocateFlags() const { return m_params.allocateFlags; } //! - inline core::bitflag getMemoryPropertyFlags() const { return m_memoryPropertyFlags; } + inline core::bitflag getMemoryPropertyFlags() const { return m_params.memoryPropertyFlags; } //! Utility function, tells whether the allocation can be mapped (whether mapMemory will ever return anything other than nullptr) - inline bool isMappable() const {return m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)||m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} + inline bool isMappable() const {return m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)|| m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} //! Utility function, tell us if writes by the CPU or GPU need extra visibility operations to become visible for reading on the other processor /** Only execute flushes or invalidations if the allocation requires them, and batch them (flush one combined range instead of two or more) for greater efficiency. To execute a flush or invalidation, use IDriver::flushMappedAllocationRanges and IDriver::invalidateMappedAllocationRanges respectively. */ inline bool haveToMakeVisible() const { - return !m_memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); + return !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); } //! @@ -106,9 +121,9 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted { if (isCurrentlyMapped()) return nullptr; - if(accessHint.hasFlags(EMCAF_READ) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) + if(accessHint.hasFlags(EMCAF_READ) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) return nullptr; - if(accessHint.hasFlags(EMCAF_WRITE) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) + if(accessHint.hasFlags(EMCAF_WRITE) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) return nullptr; m_mappedPtr = reinterpret_cast(map_impl(range,accessHint)); if (m_mappedPtr) @@ -149,23 +164,49 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Constant variant of getMappedPointer inline const void* getMappedPointer() const { return m_mappedPtr; } + struct SInfo + { + uint64_t allocationSize = 0; + core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE; + // Handle Type for external resources + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + //! 
Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE + //! Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE + ExternalHandleType externalHandle = 0; + }; + + struct SCreationParams: SInfo + { + core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; + const bool dedicated = false; + }; + + inline const SCreationParams& getCreationParams() const { return m_params; } + protected: - inline IDeviceMemoryAllocation( - const ILogicalDevice* const originDevice, const size_t _size, const core::bitflag allocateFlags, const core::bitflag memoryPropertyFlags, const bool dedicated - ) : m_originDevice(originDevice), m_allocationSize(_size), m_allocateFlags(allocateFlags), m_memoryPropertyFlags(memoryPropertyFlags), m_dedicated(dedicated) {} + inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) + { + m_postDestroyCleanup = std::move(cleanup); + } + + IDeviceMemoryAllocation( + const ILogicalDevice* originDevice, SCreationParams&& params = {}) + : m_originDevice(originDevice) + , m_params(std::move(params)) + , m_mappedPtr(nullptr) + , m_mappedRange{ 0, 0 } + , m_currentMappingAccess(EMCAF_NO_MAPPING_ACCESS) + {} virtual void* map_impl(const MemoryRange& range, const core::bitflag accessHint) = 0; virtual bool unmap_impl() = 0; - - const ILogicalDevice* const m_originDevice; - const size_t m_allocationSize; + const ILogicalDevice* m_originDevice = nullptr; + SCreationParams m_params = {}; uint8_t* m_mappedPtr = nullptr; MemoryRange m_mappedRange = {}; core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; - const core::bitflag m_allocateFlags; - const core::bitflag m_memoryPropertyFlags; - const bool m_dedicated; + std::unique_ptr m_postDestroyCleanup = nullptr; }; NBL_ENUM_ADD_BITWISE_OPERATORS(IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS) diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 0712ec24f6..22ea3c8238 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -12,11 +12,9 @@ namespace nbl::video class IDeviceMemoryAllocator { public: - struct SAllocateInfo + struct SAllocateInfo: IDeviceMemoryAllocation::SInfo { - size_t size : 54 = 0ull; - size_t flags : 5 = 0u; // IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS - size_t memoryTypeIndex : 5 = 0u; + uint32_t memoryTypeIndex = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. 
}; @@ -27,8 +25,15 @@ class IDeviceMemoryAllocator class IMemoryTypeIterator { public: - IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) - : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs) {} + IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + void* handle) + : m_allocateFlags(static_cast(allocateFlags.value)) + , m_reqs(reqs) + , m_handleType(handleType) + , m_handle(handle) + {} static inline uint32_t end() {return 32u;} @@ -40,11 +45,13 @@ class IDeviceMemoryAllocator inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) { - SAllocateInfo ret; - ret.size = m_reqs.size; - ret.flags = m_allocateFlags; + SAllocateInfo ret = {}; + ret.allocationSize = m_reqs.size; + ret.allocateFlags = core::bitflag(m_allocateFlags); ret.memoryTypeIndex = dereference(); ret.dedication = dedication; + ret.externalHandleType = m_handleType; + ret.externalHandle = m_handle; return ret; } @@ -57,17 +64,24 @@ class IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t m_allocateFlags; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; + void* m_handle; }; //! DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB class DefaultMemoryTypeIterator : public IMemoryTypeIterator { public: - DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) : IMemoryTypeIterator(reqs, allocateFlags) + DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + void* handle) + : IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } + protected: uint32_t dereference() const override { @@ -100,19 +114,26 @@ class IDeviceMemoryAllocator }; virtual SAllocation allocate(const SAllocateInfo& info) = 0; - template - inline SAllocation allocate( - const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, IDeviceMemoryBacked* dedication=nullptr, - const core::bitflag allocateFlags=IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE) + template + SAllocation allocate( + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + IDeviceMemoryBacked* dedication = nullptr, + const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType = IDeviceMemoryAllocation::EHT_NONE, + void* handle = nullptr, + std::unique_ptr&& postDestroyCleanup = nullptr) { - for(memory_type_iterator_t memTypeIt(reqs, allocateFlags); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) + for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, handleType, handle); memTypeIt != IMemoryTypeIterator::end(); ++memTypeIt) { SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); - auto allocation = allocate(allocateInfo); + SAllocation allocation = allocate(allocateInfo); if (allocation.isValid()) + { + allocation.memory->setPostDestroyCleanup(std::move(postDestroyCleanup)); return allocation; + } } - return {}; + return { }; } }; diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index f2b449557c..c5c28ad717 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ 
b/include/nbl/video/IDeviceMemoryBacked.h @@ -19,6 +19,15 @@ namespace nbl::video struct NBL_API2 ICleanup { virtual ~ICleanup() = 0; + + std::unique_ptr next; + + static void chain(std::unique_ptr& first, std::unique_ptr&& next) + { + if (first) + return chain(first->next, std::move(next)); + first = std::move(next); + } }; //! Interface from which resources backed by IDeviceMemoryAllocation inherit from @@ -37,6 +46,8 @@ class IDeviceMemoryBacked : public IBackendObject // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects bool skipHandleDestroy = false; + core::bitflag externalHandleTypes = IDeviceMemoryAllocation::EHT_NONE; + //! If you specify queue family indices, then you're concurrent sharing inline bool isConcurrentSharing() const { @@ -92,7 +103,12 @@ class IDeviceMemoryBacked : public IBackendObject { const uint32_t* queueFamilyIndices = nullptr; }; - + + void chainPreDestroyCleanup(std::unique_ptr first) + { + ICleanup::chain(m_cachedCreationParams.preDestroyCleanup, std::move(first)); + } + protected: inline IDeviceMemoryBacked(core::smart_refctd_ptr&& originDevice, SCreationParams&& creationParams, const SDeviceMemoryRequirements& reqs) : IBackendObject(std::move(originDevice)), m_cachedCreationParams(std::move(creationParams)), m_cachedMemoryReqs(reqs) {} @@ -107,10 +123,9 @@ class IDeviceMemoryBacked : public IBackendObject m_cachedCreationParams.preDestroyCleanup = nullptr; } - //! members SCachedCreationParams m_cachedCreationParams; - SDeviceMemoryRequirements m_cachedMemoryReqs; + const SDeviceMemoryRequirements m_cachedMemoryReqs; }; } // end namespace nbl::video diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 24e1731cab..a102005371 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -147,7 +147,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual IQueue::RESULT waitIdle() const = 0; //! Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) = 0; + virtual core::smart_refctd_ptr createSemaphore(uint64_t initialValue = 0, ISemaphore::SCreationParams&& = {}) = 0; virtual ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) = 0; // Forever waiting variant if you're confident that the fence will eventually be signalled inline ISemaphore::WAIT_RESULT blockForSemaphores(const std::span infos, const bool waitAll=true) @@ -285,29 +285,14 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe //! 
Descriptor Creation // Buffer (@see ICPUBuffer) - inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams) - { - const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; - if (creationParams.size>maxSize) - { - m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit!",system::ILogger::ELL_ERROR,creationParams.size,this,maxSize); - return nullptr; - } - return createBuffer_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); + // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); + // Creates an Image (@see ICPUImage) - inline core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams) - { - if (!IGPUImage::validateCreationParameters(creationParams)) - { - m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); - return nullptr; - } - // TODO: @Cyprian validation of creationParams against the device's limits (sample counts, etc.) see vkCreateImage - return createImage_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& params); + // Create an ImageView that can actually be used by shaders (@see ICPUImageView) inline core::smart_refctd_ptr createImageView(IGPUImageView::SCreationParams&& params) { @@ -765,9 +750,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual bool bindBufferMemory_impl(const uint32_t count, const SBindBufferMemoryInfo* pInfos) = 0; virtual bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) = 0; - virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) = 0; + virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) = 0; - virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) = 0; + virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 583c8ac9d0..870a435f5e 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -26,8 +26,53 @@ namespace nbl::video { + + class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable { + template static constexpr bool is_bitflag = false; + template static constexpr bool is_bitflag> = true; + + template struct RequestMapTraits; + templatestruct RequestMapTraits : RequestMapTraits {}; + template struct RequestMapTraits + { + using Key = std::tuple...>; + struct Hasher + { + template + static size_t hash(size_t seed, Key const& key) + { + if constexpr (0 == N) + return seed; + else + { + using cur = std::remove_cvref_t(key))>; + + if constexpr 
(is_bitflag) + core::hash_combine(seed, cur::UNDERLYING_TYPE(std::get(key).value)); + else if constexpr (std::is_convertible_v) + core::hash_combine(seed, size_t(std::get(key))); + else + core::hash_combine(seed, std::get(key)); + + return hash(seed, key); + } + + } + + size_t operator()(Key const& key) const + { + return hash(0, key); + } + }; + + using Map = std::unordered_map; + }; + + template + using RequestMap = typename RequestMapTraits::Map; + public: // virtual E_API_TYPE getAPIType() const = 0; @@ -242,6 +287,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable !! Same goes for `vkGetPhysicalDeviceSparseImageFormatProperties2` */ + struct SFormatBufferUsages { struct SUsage @@ -687,6 +733,82 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return createLogicalDevice_impl(std::move(params)); } + + /* ExternalMemoryProperties *//* provided by VK_KHR_external_memory_capabilities */ + struct SExternalMemoryProperties + { + uint32_t exportableTypes : 7 = ~0u; + uint32_t compatibleTypes : 7 = ~0u; + uint32_t dedicatedOnly : 1 = 0u; + uint32_t exportable : 1 = ~0u; + uint32_t importable : 1 = ~0u; + + bool operator == (SExternalMemoryProperties const& rhs) const = default; + + SExternalMemoryProperties operator &(SExternalMemoryProperties rhs) const + { + rhs.exportableTypes &= exportableTypes; + rhs.compatibleTypes &= compatibleTypes; + rhs.dedicatedOnly |= dedicatedOnly; + rhs.exportable &= exportable; + rhs.importable &= importable; + return rhs; + } + }; + + static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); + + struct SImageFormatProperties + { + VkExtent3D maxExtent = {}; + uint32_t maxMipLevels = {}; + uint32_t maxArrayLayers = {}; + IGPUImage::E_SAMPLE_COUNT_FLAGS sampleCounts = IGPUImage::ESCF_1_BIT; + uint64_t maxResourceSize = 0; + + bool operator == (SImageFormatProperties const& rhs) const = default; + }; + + struct SExternalImageFormatProperties : SImageFormatProperties, SExternalMemoryProperties + { + }; + + SExternalMemoryProperties getExternalBufferProperties( + core::bitflag usage, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + usage &= ~asset::IBuffer::EUF_SYNTHEHIC_FLAGS_MASK; // mask out synthetic flags + { + std::shared_lock lock(m_externalBufferPropertiesMutex); + auto it = m_externalBufferProperties.find({ usage, handleType }); + if (it != m_externalBufferProperties.end()) + return it->second; + } + + std::unique_lock lock(m_externalBufferPropertiesMutex); + return m_externalBufferProperties[{ usage, handleType }] = getExternalBufferProperties_impl(usage, handleType); + } + + SExternalImageFormatProperties getExternalImageProperties( + asset::E_FORMAT format, + IGPUImage::TILING tiling, + IGPUImage::E_TYPE type, + core::bitflag usage, + core::bitflag flags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + auto key = std::tuple{ format, tiling, type, usage, flags, handleType }; + { + std::shared_lock lock(m_externalImagePropertiesMutex); + auto it = m_externalImageProperties.find(key); + if (it != m_externalImageProperties.end()) + return it->second; + } + + std::unique_lock lock(m_externalImagePropertiesMutex); + return m_externalImageProperties[key] = getExternalImageProperties_impl(format, tiling, type, usage, flags, handleType); + } + protected: struct SInitData final { @@ -745,6 +867,24 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return 220u; // largest from above } + // external memory 
+ /* ExternalBufferProperties *//* provided by VK_KHR_external_memory_capabilities */ + + + virtual SExternalMemoryProperties getExternalBufferProperties_impl(core::bitflag usage, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + mutable RequestMap m_externalBufferProperties; + mutable std::shared_mutex m_externalBufferPropertiesMutex; + + virtual SExternalImageFormatProperties getExternalImageProperties_impl( + asset::E_FORMAT format, + IGPUImage::TILING tiling, + IGPUImage::E_TYPE type, + core::bitflag usage, + core::bitflag flags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + mutable RequestMap m_externalImageProperties; + mutable std::shared_mutex m_externalImagePropertiesMutex; + // Format Promotion struct SBufferFormatPromotionRequestHash { diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index 654d95a847..4000fcd2a3 100644 --- a/include/nbl/video/IQueue.h +++ b/include/nbl/video/IQueue.h @@ -68,7 +68,7 @@ class IQueue : public core::Interface, public core::Unmovable { SUCCESS, DEVICE_LOST, - OTHER_ERROR + OTHER_ERROR, }; // struct SSubmitInfo @@ -92,16 +92,23 @@ class IQueue : public core::Interface, public core::Unmovable std::span commandBuffers = {}; std::span signalSemaphores = {}; - inline bool valid() const + enum Validity + { + INVALID, + VALID, + WORK_WITHOUT_SYNC, + }; + + inline Validity valid() const { // any two being empty is wrong if (commandBuffers.empty() && signalSemaphores.empty()) // wait and do nothing - return false; + return INVALID; if (waitSemaphores.empty() && signalSemaphores.empty()) // work without sync - return false; + return WORK_WITHOUT_SYNC; if (waitSemaphores.empty() && commandBuffers.empty()) // signal without doing work first - return false; - return true; + return INVALID; + return VALID; } }; virtual RESULT submit(const std::span _submits); diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index b0e0452850..07506067af 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -44,9 +44,46 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; + //! Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0x00000000, + EHT_OPAQUE_FD = 0x00000001, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D12_FENCE = 0x00000008, + EHT_SYNC_FD = 0x00000010, + }; + + //! + struct SCreationParams + { + // A Pre-Destroy-Step is called out just before a `vkDestory` or `glDelete`, this is only useful for "imported" resources + std::unique_ptr preDestroyCleanup = nullptr; + // A Post-Destroy-Step is called in this class' destructor, this is only useful for "imported" resources + std::unique_ptr postDestroyCleanup = nullptr; + // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects + bool skipHandleDestroy = false; + // Handle Type for external resources + core::bitflag externalHandleTypes = EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE + //! 
Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE + ExternalHandleType externalHandle = nullptr; + }; + + auto const& getCreationParams() const + { + return m_creationParams; + } + protected: - inline ISemaphore(core::smart_refctd_ptr&& dev) : IBackendObject(std::move(dev)) {} + ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& params = {}) + : IBackendObject(std::move(dev)) + , m_creationParams(std::move(params)) + {} virtual ~ISemaphore() = default; + + const SCreationParams m_creationParams; }; } diff --git a/include/nbl/video/SPhysicalDeviceLimits.h b/include/nbl/video/SPhysicalDeviceLimits.h index ebb3e0dcbd..b639f37230 100644 --- a/include/nbl/video/SPhysicalDeviceLimits.h +++ b/include/nbl/video/SPhysicalDeviceLimits.h @@ -552,7 +552,6 @@ struct SPhysicalDeviceLimits /* CooperativeMatrixPropertiesKHR *//* VK_KHR_cooperative_matrix */ core::bitflag cooperativeMatrixSupportedStages = asset::IShader::ESS_UNKNOWN; - /* Always enabled if available, reported as limits */ // Core 1.0 Features diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 492a1db027..d91fe09107 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -234,7 +234,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! WARNING: This function blocks CPU and stalls the GPU! inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) { - auto semaphore = m_device->createSemaphore(0); + auto semaphore = m_device->createSemaphore(); // so we begin latching everything on the value of 1, but if we overflow it increases IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 96218c67e5..517485d08c 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -117,7 +117,6 @@ nbl_get_conf_dir(NABLA_CONF_DIR_RELEASE Release) nbl_get_conf_dir(NABLA_CONF_DIR_RELWITHDEBINFO RelWithDebInfo) if (NBL_COMPILE_WITH_CUDA) - message(STATUS "Building with CUDA interop") set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) if (NBL_BUILD_OPTIX) set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) @@ -314,6 +313,8 @@ set(NBL_VIDEO_SOURCES # CUDA ${NBL_ROOT_PATH}/src/nbl/video/CCUDAHandler.cpp ${NBL_ROOT_PATH}/src/nbl/video/CCUDADevice.cpp + ${NBL_ROOT_PATH}/src/nbl/video/CCUDASharedMemory.cpp + ${NBL_ROOT_PATH}/src/nbl/video/CCUDASharedSemaphore.cpp ) set(NBL_SCENE_SOURCES @@ -378,6 +379,10 @@ endif() target_compile_definitions(Nabla PRIVATE __NBL_BUILDING_NABLA__) +if (NBL_COMPILE_WITH_CUDA) + target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) +endif() + if (ANDROID) add_library(android_native_app_glue STATIC ${ANDROID_NDK_ROOT_PATH}/sources/android/native_app_glue/android_native_app_glue.c diff --git a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp index 4ad0710dbf..6eb93d7242 100644 --- a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp @@ -96,6 +96,7 @@ void CPLYMeshFileLoader::initialize() auto pipelineBundle = defaultOverride.findCachedAsset(pipelineCacheHash, types, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); if (pipelineBundle.getContents().empty()) { +#if 0 // WHAT IS THIS? 
auto mbVertexShader = core::smart_refctd_ptr(); auto mbFragmentShader = core::smart_refctd_ptr(); { @@ -108,6 +109,7 @@ void CPLYMeshFileLoader::initialize() mbVertexShader = core::smart_refctd_ptr_static_cast(vertexShaderBundle->begin()->getContents().begin()[0]); mbFragmentShader = core::smart_refctd_ptr_static_cast(fragmentShaderBundle->begin()->getContents().begin()[0]); } +#endif auto mbPipelineLayout = defaultOverride.findDefaultAsset("nbl/builtin/pipeline_layout/loader/PLY", fakeContext, 0u).first; @@ -130,7 +132,7 @@ void CPLYMeshFileLoader::initialize() const auto currentBitmask = core::createBitmask({ attrib }); inputParams.enabledBindingFlags |= currentBitmask; inputParams.enabledAttribFlags |= currentBitmask; - inputParams.bindings[attrib] = { asset::getTexelOrBlockBytesize(static_cast(vertexAttribParamsAllOptions[attrib].format)), EVIR_PER_VERTEX }; + inputParams.bindings[attrib] = { asset::getTexelOrBlockBytesize(static_cast(vertexAttribParamsAllOptions[attrib].format)), SVertexInputBindingParams::EVIR_PER_VERTEX}; inputParams.attributes[attrib] = vertexAttribParamsAllOptions[attrib]; } @@ -143,14 +145,15 @@ void CPLYMeshFileLoader::initialize() SRasterizationParams rastarizationParmas; +#if 0 // WHAT IS THIS? auto mbPipeline = core::make_smart_refctd_ptr(std::move(mbPipelineLayout), nullptr, nullptr, inputParams, blendParams, primitiveAssemblyParams, rastarizationParmas); { mbPipeline->setShaderAtStage(asset::IShader::ESS_VERTEX, mbVertexShader.get()); mbPipeline->setShaderAtStage(asset::IShader::ESS_FRAGMENT, mbFragmentShader.get()); - asset::SAssetBundle newPipelineBundle(nullptr, { core::smart_refctd_ptr(mbPipeline) }); defaultOverride.insertAssetIntoCache(newPipelineBundle, pipelineCacheHash, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); } +#endif } else return; diff --git a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp index c080857c63..b507153916 100644 --- a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp @@ -52,6 +52,7 @@ void CSTLMeshFileLoader::initialize() auto pipelineBundle = defaultOverride.findCachedAsset(pipelineCacheHash, types, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); if (pipelineBundle.getContents().empty()) { +#if 0 // WHAT IS THIS? auto mbVertexShader = core::smart_refctd_ptr(); auto mbFragmentShader = core::smart_refctd_ptr(); { @@ -64,7 +65,7 @@ void CSTLMeshFileLoader::initialize() mbVertexShader = core::smart_refctd_ptr_static_cast(vertexShaderBundle->begin()->getContents().begin()[0]); mbFragmentShader = core::smart_refctd_ptr_static_cast(fragmentShaderBundle->begin()->getContents().begin()[0]); } - +#endif auto defaultOverride = IAssetLoaderOverride(m_assetMgr); const IAssetLoader::SAssetLoadContext fakeContext(IAssetLoader::SAssetLoadParams{}, nullptr); @@ -79,7 +80,7 @@ void CSTLMeshFileLoader::initialize() const auto stride = positionFormatByteSize + colorFormatByteSize + normalFormatByteSize; mbInputParams.enabledBindingFlags |= core::createBitmask({ 0 }); mbInputParams.enabledAttribFlags |= core::createBitmask({ POSITION_ATTRIBUTE, NORMAL_ATTRIBUTE, withColorAttribute ? 
COLOR_ATTRIBUTE : 0 }); - mbInputParams.bindings[0] = { stride, EVIR_PER_VERTEX }; + mbInputParams.bindings[0] = { stride, SVertexInputBindingParams::EVIR_PER_VERTEX }; mbInputParams.attributes[POSITION_ATTRIBUTE].format = EF_R32G32B32_SFLOAT; mbInputParams.attributes[POSITION_ATTRIBUTE].relativeOffset = 0; @@ -102,14 +103,15 @@ void CSTLMeshFileLoader::initialize() SRasterizationParams rastarizationParmas; +#if 0 // WHAT IS THIS? auto mbPipeline = core::make_smart_refctd_ptr(std::move(mbPipelineLayout), nullptr, nullptr, mbInputParams, blendParams, primitiveAssemblyParams, rastarizationParmas); { mbPipeline->setShaderAtStage(asset::IShader::ESS_VERTEX, mbVertexShader.get()); mbPipeline->setShaderAtStage(asset::IShader::ESS_FRAGMENT, mbFragmentShader.get()); } - asset::SAssetBundle newPipelineBundle(nullptr, {core::smart_refctd_ptr(mbPipeline)}); defaultOverride.insertAssetIntoCache(newPipelineBundle, pipelineCacheHash, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); +#endif } else return; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 4d2e880095..9fbb635f52 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -1,131 +1,159 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CCUDAHandler.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { -CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture) - : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture) +CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _handle, core::smart_refctd_ptr&& _handler) + : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture), m_handle(_handle), m_handler(std::move(_handler)), m_allocationGranularity{} { m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); -} + auto& cu = m_handler->getCUDAFunctionTable(); + + CUresult re = cu.pcuCtxCreate_v2(&m_context, 0, m_handle); + assert(CUDA_SUCCESS == re); + re = cu.pcuCtxSetCurrent(m_context); + assert(CUDA_SUCCESS == re); + + for (uint32_t i = 0; i < ARRAYSIZE(m_allocationGranularity); ++i) + { + uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = {.type = static_cast(i), .id = m_handle }, + .win32HandleMetaData = metaData, + }; + auto re = cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[i], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + + assert(CUDA_SUCCESS == re); + } +} -#if 0 -CUresult CCUDAHandler::registerBuffer(GraphicsAPIObjLink* link, uint32_t flags) +CCUDADevice::~CCUDADevice() { - assert(link->obj); - auto glbuf = static_cast(link->obj.get()); - auto retval = cuda.pcuGraphicsGLRegisterBuffer(&link->cudaHandle,glbuf->getOpenGLName(),flags); - if 
(retval!=CUDA_SUCCESS) - link->obj = nullptr; - return retval; + m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); } -CUresult CCUDAHandler::registerImage(GraphicsAPIObjLink* link, uint32_t flags) + +size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const { - assert(link->obj); - - auto format = link->obj->getCreationParameters().format; - if (asset::isBlockCompressionFormat(format) || asset::isDepthOrStencilFormat(format) || asset::isScaledFormat(format) || asset::isPlanarFormat(format)) - return CUDA_ERROR_INVALID_IMAGE; - - auto glimg = static_cast(link->obj.get()); - GLenum target = glimg->getOpenGLTarget(); - switch (target) - { - case GL_TEXTURE_2D: - case GL_TEXTURE_2D_ARRAY: - case GL_TEXTURE_CUBE_MAP: - case GL_TEXTURE_3D: - break; - default: - return CUDA_ERROR_INVALID_IMAGE; - break; - } - auto retval = cuda.pcuGraphicsGLRegisterImage(&link->cudaHandle,glimg->getOpenGLName(),target,flags); - if (retval != CUDA_SUCCESS) - link->obj = nullptr; - return retval; + return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; } - -constexpr auto MaxAquireOps = 4096u; - -CUresult CCUDAHandler::acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes) +CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { - if (linksBegin+MaxAquireOpsgetCUDAFunctionTable(); + + CUdeviceptr ptr = 0; + if (auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) + return err; - size_t tmp = 0xdeadbeefbadc0ffeull; - size_t* sit = outbufferSizes; - for (auto iit=linksBegin; iit!=linksEnd; iit++,sit++) + if (auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; + cu.pcuMemAddressFree(ptr, size); + return err; + } + + CUmemAccessDesc accessDesc = { + .location = { .type = location, .id = m_handle }, + .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, + }; - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedPointer_v2(&iit->asBuffer.pointer,outbufferSizes ? 
sit:&tmp,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; + if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) + { + cu.pcuMemUnmap(ptr, size); + cu.pcuMemAddressFree(ptr, size); + return err; } + + *outPtr = ptr; + return CUDA_SUCCESS; } -CUresult CCUDAHandler::acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) + +CUresult CCUDADevice::createSharedMemory( + core::smart_refctd_ptr* outMem, + CCUDASharedMemory::SCreationParams&& inParams) { - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; + auto& cu = m_handler->getCUDAFunctionTable(); - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedMipmappedArray(&iit->asImage.mipmappedArray,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; + cu.pcuMemRelease(mem); + return err; + } - result = cuda::CCUDAHandler::cuda.pcuGraphicsSubResourceGetMappedArray(&iit->asImage.array,iit->cudaHandle,*ait,*mit); - if (result != CUDA_SUCCESS) - return result; + if (auto err = reserveAdrressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + { + CloseHandle(params.osHandle); + cu.pcuMemRelease(mem); + return err; } + + if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) + { + CloseHandle(params.osHandle); + return err; + } + + *outMem = core::smart_refctd_ptr(new CCUDASharedMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); + return CUDA_SUCCESS; } -#endif + +CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) +{ + if (!sema || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + + auto& cu = m_handler->getCUDAFunctionTable(); + auto handleType = sema->getCreationParams().externalHandleTypes; + auto handle = sema->getCreationParams().externalHandle; + + if (!handleType.hasFlags(ISemaphore::EHT_OPAQUE_WIN32) || !handle) + return CUDA_ERROR_INVALID_VALUE; + + CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, + .handle = {.win32 = {.handle = handle }}, + }; + + CUexternalSemaphore cusema; + if (auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) + return err; + + *outPtr = core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, handle), core::dont_grab); + return CUDA_SUCCESS; +} + } -#endif // _NBL_COMPILE_WITH_CUDA_ +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 183afe6b43..2789bed2a6 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -3,6 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDAHandler.h" +#include "nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "jitify/jitify.hpp" @@ -10,7 +11,49 @@ namespace nbl::video { - + +CCUDAHandler::CCUDAHandler( + CUDA&& _cuda, + NVRTC&& _nvrtc, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_cuda(std::move(_cuda)) + , m_nvrtc(std::move(_nvrtc)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + 
, m_version(_version) +{ + for (auto& header : m_headers) + { + m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); + m_headerNamesStorage.push_back(header->getFileName().string()); + m_headerNames.push_back(m_headerNamesStorage.back().c_str()); + } + + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + return; + + for (int ordinal = 0; ordinal < deviceCount; ordinal++) + { + CUdevice handle = -1; + if (m_cuda.pcuDeviceGet(&handle, ordinal) != CUDA_SUCCESS || handle < 0) + continue; + + CUuuid uuid = {}; + if (m_cuda.pcuDeviceGetUuid(&uuid, handle) != CUDA_SUCCESS) + continue; + + m_availableDevices.emplace_back(handle, uuid); + + int* attributes = m_availableDevices.back().attributes; + for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) + m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + + } +} + bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) @@ -410,7 +453,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste NVRTC nvrtc = {}; #if defined(_NBL_WINDOWS_API_) // Perpetual TODO: any new CUDA releases we need to account for? - const char* nvrtc64_versions[] = { "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; + const char* nvrtc64_versions[] = { "nvrtc64_120", "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; for (auto verpath=nvrtc64_versions; *verpath; verpath++) { @@ -468,7 +511,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste { const void* contents = it.second.data(); headers.push_back(core::make_smart_refctd_ptr>( - core::smart_refctd_ptr(system),it.first.c_str(), + it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, const_cast(contents),it.second.size()+1u )); @@ -514,7 +557,8 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; auto ptx = core::make_smart_refctd_ptr(_size); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,reinterpret_cast(ptx->getPointer()))}; + nvrtcResult result = m_nvrtc.pnvrtcGetPTX(prog, reinterpret_cast(ptx->getPointer())); + return {std::move(ptx),result}; } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) @@ -525,112 +569,98 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) - return nullptr; - - for (int ordinal=0; ordinalgetLimits().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.uuid, &physicalDevice->getProperties().deviceUUID, VK_UUID_SIZE)) { - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; - for (int i=0; i(i),handle); - CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; - const int& archMajor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; - const int& archMinor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; + const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; + const int& archMinor = 
device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; switch (archMajor) { - case 3: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_30; - break; - case 2: - arch = CCUDADevice::EVA_32; - break; - case 5: - arch = CCUDADevice::EVA_35; - break; - case 7: - arch = CCUDADevice::EVA_37; - break; - default: - break; - } + case 3: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_30; break; - case 5: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_50; - break; - case 2: - arch = CCUDADevice::EVA_52; - break; - case 3: - arch = CCUDADevice::EVA_53; - break; - default: - break; - } + case 2: + arch = CCUDADevice::EVA_32; break; - case 6: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_60; - break; - case 1: - arch = CCUDADevice::EVA_61; - break; - case 2: - arch = CCUDADevice::EVA_62; - break; - default: - break; - } + case 5: + arch = CCUDADevice::EVA_35; break; case 7: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_70; - break; - case 2: - arch = CCUDADevice::EVA_72; - break; - case 5: - arch = CCUDADevice::EVA_75; - break; - default: - break; - } + arch = CCUDADevice::EVA_37; + break; + default: + break; + } + break; + case 5: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_50; + break; + case 2: + arch = CCUDADevice::EVA_52; + break; + case 3: + arch = CCUDADevice::EVA_53; break; default: - if (archMajor>=8) - arch = CCUDADevice::EVA_80; break; + } + break; + case 6: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_60; + break; + case 1: + arch = CCUDADevice::EVA_61; + break; + case 2: + arch = CCUDADevice::EVA_62; + break; + default: + break; + } + break; + case 7: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_70; + break; + case 2: + arch = CCUDADevice::EVA_72; + break; + case 5: + arch = CCUDADevice::EVA_75; + break; + default: + break; + } + break; + default: + if (archMajor >= 8) + arch = CCUDADevice::EVA_80; + break; } - if (arch==CCUDADevice::EVA_COUNT) + if (arch == CCUDADevice::EVA_COUNT) continue; - auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); - return core::smart_refctd_ptr(device,core::dont_grab); - } - } + return core::smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)), core::dont_grab); + } + } + return nullptr; } } -#endif // _NBL_COMPILE_WITH_CUDA_ +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp new file mode 100644 index 0000000000..a5b8011920 --- /dev/null +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
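// The switch above maps the cached COMPUTE_CAPABILITY_MAJOR/MINOR attributes
// onto Nabla's E_VIRTUAL_ARCHITECTURE enum; the equivalent NVRTC option string
// can also be derived arithmetically. Minimal sketch, independent of the enum:
#include <string>

static std::string virtualArchOption(int ccMajor, int ccMinor)
{
    // e.g. (7,5) -> "--gpu-architecture=compute_75"
    return "--gpu-architecture=compute_" + std::to_string(ccMajor * 10 + ccMinor);
}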
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ + +core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const +{ + auto pd = device->getPhysicalDevice(); + uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; + uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); + + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; + case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; + // TODO(Atil): Figure out how to handle these + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + default: break; + } + + IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; + req.size = m_params.granularSize; + req.memoryTypeBits = memoryTypeBits; + req.prefersDedicatedAllocation = nullptr != dedication; + req.requiresDedicatedAllocation = nullptr != dedication; + + return device->allocate(req, + dedication, + IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, m_params.osHandle, + std::make_unique(core::smart_refctd_ptr(this))).memory; +} + +core::smart_refctd_ptr CCUDASharedMemory::createAndBindImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const +{ + if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) + return nullptr; + + auto img = device->createImage({ std::move(params), { {.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE } }, IGPUImage::TILING::LINEAR }); + + if (exportAsMemory(device, img.get())) + return img; + + return nullptr; +} + +CCUDASharedMemory::~CCUDASharedMemory() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + + CUresult re[] = { + cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), + }; + CloseHandle(m_params.osHandle); + +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDASharedSemaphore.cpp b/src/nbl/video/CCUDASharedSemaphore.cpp new file mode 100644 index 0000000000..4d6d3aacc9 --- /dev/null +++ b/src/nbl/video/CCUDASharedSemaphore.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
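// Usage sketch for the CUDA-to-Vulkan direction shown above: memory allocated
// through CCUDADevice::createSharedMemory is bound to a freshly created
// IGPUImage via createAndBindImage. `device` and `mem` are assumed to exist
// already and every image parameter below is a placeholder value.
#include "nbl/video/CCUDASharedMemory.h"
#include "nbl/video/ILogicalDevice.h"

static nbl::core::smart_refctd_ptr<nbl::video::IGPUImage> makeSharedImage(
    nbl::video::ILogicalDevice* device, nbl::video::CCUDASharedMemory* mem)
{
    using namespace nbl;
    asset::IImage::SCreationParams params = {};
    params.type = asset::IImage::ET_2D;
    params.format = asset::EF_R8G8B8A8_UNORM;
    params.extent = { 1024u, 1024u, 1u };
    params.mipLevels = 1u;
    params.arrayLayers = 1u;
    params.samples = asset::IImage::ESCF_1_BIT;
    params.usage = asset::IImage::EUF_SAMPLED_BIT;
    // createAndBindImage creates the image with the matching external handle
    // type and binds the exported CUDA allocation to it (see above).
    return mem->createAndBindImage(device, std::move(params));
}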
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ +CCUDASharedSemaphore::~CCUDASharedSemaphore() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + cu.pcuDestroyExternalSemaphore(m_handle); + CloseHandle(m_osHandle); +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CVulkanBuffer.cpp b/src/nbl/video/CVulkanBuffer.cpp index c24e13ceb8..2e542944d3 100644 --- a/src/nbl/video/CVulkanBuffer.cpp +++ b/src/nbl/video/CVulkanBuffer.cpp @@ -8,7 +8,7 @@ namespace nbl::video CVulkanBuffer::~CVulkanBuffer() { preDestroyStep(); - if (m_cachedCreationParams.skipHandleDestroy) + if (!m_cachedCreationParams.skipHandleDestroy) { const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); auto* vk = vulkanDevice->getFunctionTable(); diff --git a/src/nbl/video/CVulkanBuffer.h b/src/nbl/video/CVulkanBuffer.h index 4596981c2a..988d50c2ec 100644 --- a/src/nbl/video/CVulkanBuffer.h +++ b/src/nbl/video/CVulkanBuffer.h @@ -16,7 +16,7 @@ class CVulkanBuffer : public CVulkanDeviceMemoryBacked using base_t = CVulkanDeviceMemoryBacked; public: - inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, const VkBuffer buffer) : base_t(dev,std::move(creationParams),buffer) {} + inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly, const VkBuffer buffer) : base_t(dev,std::move(creationParams), dedicatedOnly, buffer) {} void setObjectDebugName(const char* label) const override; diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index 2b1f9d9070..64ec5f68c0 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -48,25 +48,41 @@ void fill(vk_barrier_t& out, const ResourceBarrier& in, uint32_t selfQueueFamily // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkBufferMemoryBarrier2-buffer-04088 if (concurrentSharing) selfQueueFamilyIndex = IQueue::FamilyIgnored; + + auto mapQFIdx = [](uint32_t idx) + { + switch (idx) + { + case IQueue::FamilyExternal: + case IQueue::FamilyIgnored: + case IQueue::FamilyForeign: + idx |= 1u << 31; + break; + } + return idx; + }; + if constexpr (!std::is_same_v) { - out.srcQueueFamilyIndex = selfQueueFamilyIndex; - out.dstQueueFamilyIndex = selfQueueFamilyIndex; + out.srcQueueFamilyIndex = mapQFIdx(selfQueueFamilyIndex); + out.dstQueueFamilyIndex = mapQFIdx(selfQueueFamilyIndex); } const asset::SMemoryBarrier* memoryBarrier; if constexpr (std::is_same_v) { memoryBarrier = &in.dep; // in.otherQueueFamilyIndex==selfQueueFamilyIndex not resulting in ownership transfer is implicit - if (!concurrentSharing && in.otherQueueFamilyIndex!=IQueue::FamilyIgnored) - switch (in.ownershipOp) + if (!concurrentSharing && in.otherQueueFamilyIndex != IQueue::FamilyIgnored) { + switch (in.ownershipOp) + { case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE: - out.dstQueueFamilyIndex = in.otherQueueFamilyIndex; + out.dstQueueFamilyIndex = mapQFIdx(in.otherQueueFamilyIndex); break; case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE: - out.srcQueueFamilyIndex = in.otherQueueFamilyIndex; + out.srcQueueFamilyIndex = mapQFIdx(in.otherQueueFamilyIndex); break; + } } } else diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp 
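// At the raw Vulkan level, acquiring ownership of a buffer released by an
// external API (the case the mapQFIdx helper above has to preserve) is a
// buffer memory barrier whose source family is VK_QUEUE_FAMILY_EXTERNAL.
// Plain-Vulkan sketch; `buf`, `size` and `selfFamily` are assumed inputs.
#include <vulkan/vulkan.h>

static VkBufferMemoryBarrier2 acquireFromExternal(VkBuffer buf, VkDeviceSize size, uint32_t selfFamily)
{
    VkBufferMemoryBarrier2 barrier = { VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2 };
    barrier.srcStageMask = VK_PIPELINE_STAGE_2_NONE;         // visibility is handled by the shared semaphore wait
    barrier.srcAccessMask = VK_ACCESS_2_NONE;
    barrier.dstStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
    barrier.dstAccessMask = VK_ACCESS_2_MEMORY_READ_BIT | VK_ACCESS_2_MEMORY_WRITE_BIT;
    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL;  // releasing side lives outside Vulkan
    barrier.dstQueueFamilyIndex = selfFamily;                // queue family that records this barrier
    barrier.buffer = buf;
    barrier.offset = 0;
    barrier.size = size;
    return barrier;
}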
b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 2bec9e9d06..8f08f9aa67 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -6,7 +6,7 @@ namespace nbl::video { template -IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle) +IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle) { const std::conditional_t vk_memoryRequirementsInfo = { IsImage ? VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2:VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,nullptr,vkHandle @@ -24,8 +24,8 @@ IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked(getOriginDevice()); diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 91d158b0ea..607aa69caa 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -45,21 +45,74 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue) +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(uint64_t initialValue, ISemaphore::SCreationParams&& params) { +#ifdef _WIN32 + VkImportSemaphoreWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(params.externalHandleTypes.value), + .handle = params.externalHandle, + }; + VkExportSemaphoreWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL + }; +#else + VkImportSemaphoreFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR, + .handleType = static_cast(params.externalHandleTypes.value), + .fd = params.externalHandle, + }; +#endif + + VkExportSemaphoreCreateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif + .handleTypes = static_cast(params.externalHandleTypes.value) + }; + + + const bool importing = params.externalHandleTypes.value && params.externalHandle; + const bool exporting = params.externalHandleTypes.value && !params.externalHandle; + VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; - type.pNext = nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR + type.pNext = exporting ? 
&exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; type.initialValue = initialValue; - VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,&type }; + VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, &type }; createInfo.flags = static_cast(0); // flags must be 0 VkSemaphore semaphore; - if (m_devf.vk.vkCreateSemaphore(m_vkdev,&createInfo,nullptr,&semaphore)==VK_SUCCESS) - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this),semaphore); - else + if (VK_SUCCESS != m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore)) return nullptr; + + VkSemaphoreGetWin32HandleInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = semaphore, + .handleType = static_cast(params.externalHandleTypes.value), + }; + +#ifdef _WIN32 + auto importfn = m_devf.vk.vkImportSemaphoreWin32HandleKHR; + auto exportfn = m_devf.vk.vkGetSemaphoreWin32HandleKHR; +#else + auto importfn = m_devf.vk.vkImportSemaphoreFdKHR; + auto exportfn = m_devf.vk.vkGetSemaphoreFdKHR; +#endif + + if ( + (importing && (VK_SUCCESS != importfn(m_vkdev, &importInfo))) || + (exporting && (VK_SUCCESS != exportfn(m_vkdev, &props, ¶ms.externalHandle))) + ) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); + return nullptr; + } + + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), semaphore, std::move(params)); } ISemaphore::WAIT_RESULT CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) { @@ -125,26 +178,89 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDeferredO return core::smart_refctd_ptr(reinterpret_cast(memory),core::dont_grab); } +void* DupeHandle(uint64_t pid, void* handle) +{ +#ifdef _WIN32 + DWORD flags; + HANDLE re = 0; + + HANDLE cur = GetCurrentProcess(); + HANDLE src = pid ? 
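// Usage sketch for the export path of the createSemaphore overload above:
// request an exportable timeline semaphore (externalHandleTypes set,
// externalHandle left null so the freshly exported handle is written back into
// the stored creation params), then hand it to the CUDA side via
// importGPUSemaphore. Assumes Win32 opaque handles and pre-existing `device`
// and `cudaDevice` objects; the createSemaphore signature is the one this
// patch introduces.
#include "nbl/video/CCUDADevice.h"
#include "nbl/video/ILogicalDevice.h"

static nbl::core::smart_refctd_ptr<nbl::video::CCUDASharedSemaphore> makeSharedTimeline(
    nbl::video::ILogicalDevice* device, nbl::video::CCUDADevice* cudaDevice)
{
    using namespace nbl::video;
    ISemaphore::SCreationParams params = {};
    params.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; // export, don't import
    auto sema = device->createSemaphore(0ull, std::move(params));
    if (!sema)
        return nullptr;

    nbl::core::smart_refctd_ptr<CCUDASharedSemaphore> shared;
    if (cudaDevice->importGPUSemaphore(&shared, sema.get()) != CUDA_SUCCESS)
        return nullptr;
    return shared;
}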
OpenProcess(GENERIC_ALL, false, pid) : cur; + + if (!DuplicateHandle(src, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return 0; + + CloseHandle(src); + return re; +#endif + return handle; +} IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAllocateInfo& info) { - IDeviceMemoryAllocator::SAllocation ret = {}; if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) - return ret; + return {}; - const core::bitflag allocateFlags(info.flags); VkMemoryAllocateFlagsInfo vk_allocateFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, nullptr }; { - if (allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) + if (info.allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) vk_allocateFlagsInfo.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT; vk_allocateFlagsInfo.deviceMask = 0u; // unused: for now } VkMemoryDedicatedAllocateInfo vk_dedicatedInfo = {VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, nullptr}; + VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo }; + vk_allocateInfo.allocationSize = info.allocationSize; + vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; + +#ifdef _WIN32 + VkImportMemoryWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .handle = info.externalHandle + }; + + VkExportMemoryWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL, + }; +#else + VkImportMemoryFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .fd = (int)info.externalHandle, + }; +#endif + + VkExportMemoryAllocateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif + .handleTypes = static_cast(info.externalHandleType), + }; + + const void** pNext = &vk_allocateFlagsInfo.pNext; + + if (info.externalHandleType) + { + if (info.externalHandle) //importing + { + auto duped = DupeHandle(0, info.externalHandle); + const_cast(info.externalHandle) = duped; + *pNext = &importInfo; + } + else // exporting + *pNext = &exportInfo; + pNext = (const void**)&((VkBaseInStructure*)*pNext)->pNext; + } + if(info.dedication) { // VK_KHR_dedicated_allocation is in core 1.1, no querying for support needed static_assert(MinimumVulkanApiVersion >= VK_MAKE_API_VERSION(0,1,1,0)); - vk_allocateFlagsInfo.pNext = &vk_dedicatedInfo; + *pNext = &vk_dedicatedInfo; + pNext = &vk_dedicatedInfo.pNext; + switch (info.dedication->getObjectType()) { case IDeviceMemoryBacked::EOT_BUFFER: @@ -155,23 +271,65 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca break; default: assert(false); - return ret; + return {}; break; } } - VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo}; - vk_allocateInfo.allocationSize = info.size; - vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; VkDeviceMemory vk_deviceMemory; auto vk_res = m_devf.vk.vkAllocateMemory(m_vkdev, &vk_allocateInfo, nullptr, &vk_deviceMemory); if (vk_res!=VK_SUCCESS) - return ret; + return {}; + + const bool exported = info.externalHandleType && !info.externalHandle; + + if (exported) + { +#ifdef _WIN32 + VkMemoryGetWin32HandleInfoKHR +#else + VkMemoryGetFdInfoKHR +#endif + handleInfo = { .sType = +#ifdef 
_WIN32 + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#else + VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, +#endif + .memory = vk_deviceMemory, + .handleType = static_cast(info.externalHandleType), + }; + + /* + For handle types defined as NT handles, + the handles returned by vkGetMemoryWin32HandleKHR are owned by the application + and hold a reference to their payload. To avoid leaking resources, + the application must release ownership of them + using the CloseHandle system call when they are no longer needed. + */ + + if (VK_SUCCESS != m_devf.vk. +#ifdef _WIN32 + vkGetMemoryWin32HandleKHR +#else + vkGetMemoryFdKHR +#endif + (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) + { + m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); + return {}; + } + + } // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; - ret.memory = core::make_smart_refctd_ptr(this,info.size,allocateFlags,memoryPropertyFlags,info.dedication,vk_deviceMemory); + + CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; + IDeviceMemoryAllocator::SAllocation ret = {}; + ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, std::move(params)); + ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator + if(info.dedication) { bool dedicationSuccess = false; @@ -288,22 +446,29 @@ bool CVulkanLogicalDevice::bindImageMemory_impl(const uint32_t count, const SBin } -core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) +core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) { VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; - // VkBufferDeviceAddressCreateInfoEXT, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR - vk_createInfo.pNext = nullptr; + // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkBufferDeviceAddressCreateInfoEXT, VkBufferOpaqueCaptureAddressCreateInfo, VkDedicatedAllocationBufferCreateInfoNV, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR + + VkExternalMemoryBufferCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO, + .handleTypes = creationParams.externalHandleTypes.value, + }; + + vk_createInfo.pNext = creationParams.externalHandleTypes.value ? &externalMemoryInfo : nullptr; + vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); vk_createInfo.usage = getVkBufferUsageFlagsFromBufferUsageFlags(creationParams.usage); - vk_createInfo.sharingMode = creationParams.isConcurrentSharing() ? 
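// Raw-Vulkan sketch of the export path that allocate() above implements, shown
// for the POSIX fd flavour to complement the Win32-heavy code: chain
// VkExportMemoryAllocateInfo into the allocation, then retrieve the fd with
// vkGetMemoryFdKHR (note this struct's own sType,
// VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR). Assumes VK_KHR_external_memory_fd
// is enabled; `device`, `size` and `memoryTypeIndex` are assumed inputs.
#include <vulkan/vulkan.h>

static int allocateExportableFd(VkDevice device, VkDeviceSize size, uint32_t memoryTypeIndex, VkDeviceMemory* outMem)
{
    VkExportMemoryAllocateInfo exportInfo = { VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO };
    exportInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;

    VkMemoryAllocateInfo allocInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &exportInfo };
    allocInfo.allocationSize = size;
    allocInfo.memoryTypeIndex = memoryTypeIndex;
    if (vkAllocateMemory(device, &allocInfo, nullptr, outMem) != VK_SUCCESS)
        return -1;

    // extension entry point, fetched through the device dispatch
    auto pfnGetMemoryFd = reinterpret_cast<PFN_vkGetMemoryFdKHR>(vkGetDeviceProcAddr(device, "vkGetMemoryFdKHR"));

    VkMemoryGetFdInfoKHR getInfo = { VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR };
    getInfo.memory = *outMem;
    getInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;

    int fd = -1;
    if (!pfnGetMemoryFd || pfnGetMemoryFd(device, &getInfo, &fd) != VK_SUCCESS)
    {
        vkFreeMemory(device, *outMem, nullptr);
        return -1;
    }
    return fd; // caller owns the fd and must close() it
}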
VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = creationParams.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = creationParams.queueFamilyIndices; + VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(creationParams),vk_buffer); + return core::make_smart_refctd_ptr(this,std::move(creationParams), dedicatedOnly, vk_buffer); } core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) @@ -322,19 +487,24 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_im return nullptr; } -core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params) +core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) { - VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, nullptr }; - vk_stencilUsage.stencilUsage = getVkImageUsageFlagsFromImageUsageFlags(params.actualStencilUsage().value,true); + VkExternalMemoryImageCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, + .handleTypes = params.externalHandleTypes.value, + }; + + VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, &externalMemoryInfo }; + vk_stencilUsage.stencilUsage = getVkImageUsageFlagsFromImageUsageFlags(params.actualStencilUsage().value, true); - std::array vk_formatList; + std::array vk_formatList; VkImageFormatListCreateInfo vk_formatListStruct = { VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO, &vk_stencilUsage }; vk_formatListStruct.viewFormatCount = 0u; // if only there existed a nice iterator that would let me iterate over set bits 64 faster if (params.viewFormats.any()) - for (auto fmt=0; fmt(fmt)); + for (auto fmt = 0; fmt < vk_formatList.size(); fmt++) + if (params.viewFormats.test(fmt)) + vk_formatList[vk_formatListStruct.viewFormatCount++] = getVkFormatFromFormat(static_cast(fmt)); vk_formatListStruct.pViewFormats = vk_formatList.data(); VkImageCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, &vk_formatListStruct }; @@ -346,16 +516,17 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_createInfo.arrayLayers = params.arrayLayers; vk_createInfo.samples = static_cast(params.samples); vk_createInfo.tiling = static_cast(params.tiling); - vk_createInfo.usage = getVkImageUsageFlagsFromImageUsageFlags(params.usage.value,asset::isDepthOrStencilFormat(params.format)); - vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT:VK_SHARING_MODE_EXCLUSIVE; + vk_createInfo.usage = getVkImageUsageFlagsFromImageUsageFlags(params.usage.value, asset::isDepthOrStencilFormat(params.format)); + vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = params.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = params.queueFamilyIndices; - vk_createInfo.initialLayout = params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED:VK_IMAGE_LAYOUT_UNDEFINED; + vk_createInfo.initialLayout = params.preinitialized ? 
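// The view-format loop above visits every format index even when only a few
// bits are set ("if only there existed a nice iterator..."). With C++20 bit
// operations one can jump straight between set bits instead. Illustrative
// sketch over a plain 64-bit mask; the engine's viewFormats bitset is wider,
// so this shows the idiom rather than a drop-in replacement.
#include <bit>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> setBitIndices(uint64_t mask)
{
    std::vector<uint32_t> out;
    while (mask)
    {
        out.push_back(static_cast<uint32_t>(std::countr_zero(mask))); // lowest set bit
        mask &= mask - 1ull;                                          // clear it
    }
    return out;
}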
VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED; + VkImage vk_image; - if (m_devf.vk.vkCreateImage(m_vkdev,&vk_createInfo,nullptr,&vk_image)!=VK_SUCCESS) + if (m_devf.vk.vkCreateImage(m_vkdev, &vk_createInfo, nullptr, &vk_image) != VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(params),vk_image); + return core::make_smart_refctd_ptr(this, std::move(params), dedicatedOnly, vk_image); } core::smart_refctd_ptr CVulkanLogicalDevice::createImageView_impl(IGPUImageView::SCreationParams&& params) diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 65489d9c53..f18fb3dad4 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -52,7 +52,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice return CVulkanQueue::getResultFrom(m_devf.vk.vkDeviceWaitIdle(m_vkdev)); } - core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) override; + core::smart_refctd_ptr createSemaphore(uint64_t initialValue, ISemaphore::SCreationParams &&) override; ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; @@ -103,9 +103,9 @@ class CVulkanLogicalDevice final : public ILogicalDevice bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) override; // descriptor creation - core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) override; + core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) override; core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) override; - core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) override; + core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) override; core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) override; VkAccelerationStructureKHR createAccelerationStructure(const IGPUAccelerationStructure::SCreationParams& params, const VkAccelerationStructureTypeKHR type, const VkAccelerationStructureMotionInfoNV* motionInfo=nullptr); inline core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) override diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 5a4dfd5ff5..7597e33717 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -4,14 +4,24 @@ namespace nbl::video { CVulkanMemoryAllocation::CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle -) : IDeviceMemoryAllocation(dev,size,flags,memoryPropertyFlags,isDedicated), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + SCreationParams&& params +) + : IDeviceMemoryAllocation(dev,std::move(params)) + , m_vulkanDevice(dev) + , m_deviceMemoryHandle(deviceMemoryHandle) +{ +} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { + if (m_params.externalHandle) + { + bool re = CloseHandle(getCreationParams().externalHandle); + assert(re); + } + 
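// Several destructors in this patch close exported NT handles by hand
// (CVulkanMemoryAllocation above, the CUDA wrappers earlier). A tiny RAII
// guard expresses the same ownership rule; hypothetical helper, not part of
// the patch, Win32 only.
#ifdef _WIN32
#include <windows.h>

class UniqueNTHandle
{
    public:
        explicit UniqueNTHandle(HANDLE h = nullptr) : m_handle(h) {}
        UniqueNTHandle(const UniqueNTHandle&) = delete;
        UniqueNTHandle& operator=(const UniqueNTHandle&) = delete;
        UniqueNTHandle(UniqueNTHandle&& other) noexcept : m_handle(other.release()) {}
        ~UniqueNTHandle() { if (m_handle) CloseHandle(m_handle); }

        HANDLE get() const { return m_handle; }
        HANDLE release() { HANDLE h = m_handle; m_handle = nullptr; return h; }

    private:
        HANDLE m_handle;
};
#endif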
m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); } diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 470e914ae3..d9508411b0 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -15,10 +15,9 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation { public: CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index c6304ec836..62dcde7d42 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1204,6 +1204,7 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart if (isExtensionSupported(VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME)) properties.limits.cooperativeMatrixRobustness = cooperativeMatrixFeatures.robustness; #endif + } // we compare all limits against the defaults easily! diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index c1552c88f1..9cfebccd3f 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -109,6 +109,79 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice // [NOOP] If sparseImageFloat32AtomicMinMax is enabled, shaderImageFloat32AtomicMinMax must be enabled } + inline static SExternalMemoryProperties mapExternalMemoryProps(VkExternalMemoryProperties const& props) + { + return { + .exportableTypes = props.exportFromImportedHandleTypes, + .compatibleTypes = props.compatibleHandleTypes, + .dedicatedOnly = props.externalMemoryFeatures & VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT ? 1u : 0u, + .exportable = props.externalMemoryFeatures & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT ? 1u : 0u, + .importable = props.externalMemoryFeatures & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT ? 
1u : 0u, + }; + } + + SExternalMemoryProperties getExternalBufferProperties_impl(core::bitflag usage, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override + { + assert(!(handleType & (handleType - 1))); + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO, + .usage = static_cast(usage.value), + .handleType = static_cast(handleType) + }; + VkExternalBufferProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES }; + vkGetPhysicalDeviceExternalBufferProperties(m_vkPhysicalDevice, &info, &externalProps); + return mapExternalMemoryProps(externalProps.externalMemoryProperties); + } + + SExternalImageFormatProperties getExternalImageProperties_impl( + asset::E_FORMAT format, + IGPUImage::TILING tiling, + IGPUImage::E_TYPE type, + core::bitflag usage, + core::bitflag flags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override + { + assert(!(handleType & (handleType - 1))); + + VkPhysicalDeviceExternalImageFormatInfo extInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO, + .handleType = static_cast(handleType), + }; + + VkPhysicalDeviceImageFormatInfo2 info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = &extInfo, + .format = getVkFormatFromFormat(format), + .type = static_cast(type), + .tiling = static_cast(tiling), + .usage = usage.value, + .flags = flags.value, + }; + + VkExternalImageFormatProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES }; + + VkImageFormatProperties2 props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = &externalProps, + }; + + VkResult re = vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &info, &props); + if(VK_SUCCESS != re) + return {}; + + return + { + { + .maxExtent = props.imageFormatProperties.maxExtent, + .maxMipLevels = props.imageFormatProperties.maxMipLevels, + .maxArrayLayers = props.imageFormatProperties.maxArrayLayers, + .sampleCounts = static_cast(props.imageFormatProperties.sampleCounts), + .maxResourceSize = props.imageFormatProperties.maxResourceSize, + }, + mapExternalMemoryProps(externalProps.externalMemoryProperties) + }; + } + core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) override; private: diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 9290110d8d..2beb7cb21b 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,8 +15,11 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, const VkSemaphore semaphore) - : ISemaphore(std::move(_vkdev)), m_semaphore(semaphore) {} + inline CVulkanSemaphore(core::smart_refctd_ptr&& dev, const VkSemaphore semaphore, SCreationParams&& params = {}) + : ISemaphore(std::move(dev), std::move(params)) + , m_semaphore(semaphore) + {} + ~CVulkanSemaphore(); uint64_t getCounterValue() const override; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 97030ccbba..2902ff7509 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -111,7 +111,7 @@ bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag return getSupportedStageMask(queueFamilyIndex).hasFlags(stageMask); } -bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag stageMask) const +bool 
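// Raw-Vulkan sketch of the capability query the two overrides above wrap: ask
// whether a usage/handle-type pair can be exported, imported, and whether it
// requires a dedicated allocation. `physDev` is an assumed input.
#include <vulkan/vulkan.h>

struct ExternalBufferCaps { bool exportable, importable, dedicatedOnly; };

static ExternalBufferCaps queryExternalBufferCaps(
    VkPhysicalDevice physDev, VkBufferUsageFlags usage, VkExternalMemoryHandleTypeFlagBits handleType)
{
    VkPhysicalDeviceExternalBufferInfo info = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO };
    info.usage = usage;
    info.handleType = handleType;

    VkExternalBufferProperties props = { VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES };
    vkGetPhysicalDeviceExternalBufferProperties(physDev, &info, &props);

    const VkExternalMemoryFeatureFlags f = props.externalMemoryProperties.externalMemoryFeatures;
    return {
        (f & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT) != 0,
        (f & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT) != 0,
        (f & VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT) != 0,
    };
}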
ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag accesMask) const { if (queueFamilyIndex>m_queueFamilyInfos->size()) return false; @@ -119,15 +119,15 @@ bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag const auto& familyProps = m_physicalDevice->getQueueFamilyProperties()[queueFamilyIndex].queueFlags; const bool shaderCapableFamily = bool(familyProps&(q_family_flags_t::COMPUTE_BIT|q_family_flags_t::GRAPHICS_BIT)); // strip special values - if (stageMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_READ_BITS)) - stageMask ^= asset::ACCESS_FLAGS::MEMORY_READ_BITS; - else if (stageMask.hasFlags(asset::ACCESS_FLAGS::SHADER_READ_BITS) && shaderCapableFamily) - stageMask ^= asset::ACCESS_FLAGS::SHADER_READ_BITS; - if (stageMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_WRITE_BITS)) - stageMask ^= asset::ACCESS_FLAGS::MEMORY_WRITE_BITS; - else if (stageMask.hasFlags(asset::ACCESS_FLAGS::SHADER_WRITE_BITS) && shaderCapableFamily) - stageMask ^= asset::ACCESS_FLAGS::SHADER_WRITE_BITS; - return getSupportedAccessMask(queueFamilyIndex).hasFlags(stageMask); + if (accesMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_READ_BITS)) + accesMask ^= asset::ACCESS_FLAGS::MEMORY_READ_BITS; + else if (accesMask.hasFlags(asset::ACCESS_FLAGS::SHADER_READ_BITS) && shaderCapableFamily) + accesMask ^= asset::ACCESS_FLAGS::SHADER_READ_BITS; + if (accesMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_WRITE_BITS)) + accesMask ^= asset::ACCESS_FLAGS::MEMORY_WRITE_BITS; + else if (accesMask.hasFlags(asset::ACCESS_FLAGS::SHADER_WRITE_BITS) && shaderCapableFamily) + accesMask ^= asset::ACCESS_FLAGS::SHADER_WRITE_BITS; + return getSupportedAccessMask(queueFamilyIndex).hasFlags(accesMask); } bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyIndex, asset::SMemoryBarrier barrier) const @@ -647,4 +647,74 @@ bool ILogicalDevice::createGraphicsPipelines( if (!output[i]) return false; return true; +} + +core::smart_refctd_ptr ILogicalDevice::createBuffer(IGPUBuffer::SCreationParams&& creationParams) +{ + const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; + if (creationParams.size > maxSize) + { + m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit!", system::ILogger::ELL_ERROR, creationParams.size, this, maxSize); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(creationParams.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + dedicatedOnly |= props.dedicatedOnly; + } + } + return createBuffer_impl(std::move(creationParams), dedicatedOnly); +} + +core::smart_refctd_ptr ILogicalDevice::createImage(IGPUImage::SCreationParams&& params) +{ + if (!IGPUImage::validateCreationParameters(params)) + { + m_logger.log("Failed to create Image, invalid creation parameters!", system::ILogger::ELL_ERROR); + return nullptr; + } + + const bool external = params.externalHandleTypes.value; + bool dedicatedOnly = false; + if (external) + { + core::bitflag requestedTypes = params.externalHandleTypes; + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) 
+ 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalImageProperties(params.format, params.tiling, params.type, params.usage, params.flags, handleType); + + if (props.maxArrayLayers < params.arrayLayers || + !core::bitflag(props.sampleCounts).hasFlags(params.samples) || + /* props.maxResourceSize?? */ + props.maxExtent.width < params.extent.width || + props.maxExtent.height < params.extent.height || + props.maxExtent.depth < params.extent.depth) + { + return nullptr; + } + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(params.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + dedicatedOnly |= props.dedicatedOnly; + } + } + // TODO: @Cyprian validation of creationParams against the device's limits (sample counts, etc.) see vkCreateImage + return createImage_impl(std::move(params), dedicatedOnly); } \ No newline at end of file diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index e75e7b2cad..2527562bac 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -13,8 +13,15 @@ auto IQueue::submit(const std::span _submits) -> RESULT auto* logger = m_originDevice->getPhysicalDevice()->getDebugCallback()->getLogger(); for (const auto& submit : _submits) { - if (!submit.valid()) + switch (submit.valid()) + { + case SSubmitInfo::INVALID: return RESULT::OTHER_ERROR; + case SSubmitInfo::WORK_WITHOUT_SYNC: + logger->log("Work without sync!", system::ILogger::ELL_WARNING); + default: + break; + } auto invalidSemaphores = [this,logger](const std::span semaphoreInfos) -> bool {
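// The createBuffer/createImage overloads above walk the requested external
// handle-type mask one bit at a time and fold the per-type "dedicatedOnly"
// requirement into a single flag. Equivalent loop in standard C++20, with the
// per-type query left abstract (`queryDedicatedOnly` is a placeholder).
#include <bit>
#include <cstdint>

static bool anyTypeDedicatedOnly(uint32_t handleTypeMask, bool (*queryDedicatedOnly)(uint32_t singleType))
{
    bool dedicatedOnly = false;
    while (handleTypeMask)
    {
        const uint32_t singleType = 1u << std::countr_zero(handleTypeMask); // lowest requested type
        handleTypeMask ^= singleType;                                       // strip it, like the findLSB loop above
        dedicatedOnly |= queryDedicatedOnly(singleType);
    }
    return dedicatedOnly;
}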