From d293b9b85589ba6e483bc2e7bdabe9c563793190 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 8 Jul 2023 18:29:58 +0300 Subject: [PATCH 01/62] create exportable buffers to import into cuda --- include/nbl/video/CCUDAHandler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 01774b25d2..fe1ba28204 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -119,6 +119,7 @@ class CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate ,cuTexObjectDestroy + ,cuImportExternalMemory ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} From f5f1017b876c9c8beb5c1c2fa43749977c68a2ba Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 16:10:04 +0300 Subject: [PATCH 02/62] add missing cuda fn and update submodule --- examples_tests | 2 +- include/nbl/video/CCUDADevice.h | 2 +- include/nbl/video/CCUDAHandler.h | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples_tests b/examples_tests index 31f501f9b1..faddda46b2 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 31f501f9b1624457eaf4a71eececa1fb67172ca3 +Subproject commit faddda46b285b433c2926b384064bd80a6889b43 diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 1120224fdb..ceb8ec3a8d 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -179,7 +179,7 @@ class CCUDADevice : public core::IReferenceCounted static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); #endif - + CUdevice getInternalObject() const { return m_handle; } protected: friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* 
const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index fe1ba28204..fb3d52fc0f 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -120,6 +120,8 @@ class CCUDAHandler : public core::IReferenceCounted ,cuTexObjectCreate ,cuTexObjectDestroy ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} From 6689b335623771a309904de44f27556476c91978 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 20:56:00 +0300 Subject: [PATCH 03/62] add missing cuda export functions --- include/nbl/video/CCUDAHandler.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index fb3d52fc0f..838c527567 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -122,6 +122,15 @@ class CCUDAHandler : public core::IReferenceCounted ,cuImportExternalMemory ,cuDestroyExternalMemory ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} From 9ade1c66917d1c34d072623e62c50c2dfb6f3b75 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 23:34:16 +0300 Subject: [PATCH 04/62] move boilerplates to CCUDADevice --- include/nbl/video/CCUDADevice.h | 31 +++++ include/nbl/video/CCUDAHandler.h | 1 + include/nbl/video/IDeviceMemoryBacked.h | 25 ++++ src/nbl/video/CCUDADevice.cpp | 145 ++++++++++++++++++++++++ 4 files changed, 202 insertions(+) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index ceb8ec3a8d..fd39f8ec57 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -180,7 
+180,38 @@ class CCUDADevice : public core::IReferenceCounted static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); #endif CUdevice getInternalObject() const { return m_handle; } + const CCUDAHandler* getHandler() const { return m_handler.get(); } + + struct SSharedCUDAMemory + { + size_t size; + CUdeviceptr ptr; + CUmemGenericAllocationHandle memory; + void* osHandle; + }; + + core::smart_refctd_ptr exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device); + + CUresult importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr); + CUresult createExportableMemory(size_t size, size_t alignment, SSharedCUDAMemory* outMem); + CUresult releaseExportableMemory(SSharedCUDAMemory mem); protected: + + struct SCUDACleaner : video::ICleanup, SSharedCUDAMemory + { + core::smart_refctd_ptr dev; + SCUDACleaner(SSharedCUDAMemory mem, core::smart_refctd_ptr&& dev) + : SSharedCUDAMemory{ mem } + , dev(std::move(dev)) + { } + + ~SCUDACleaner() + { + dev->releaseExportableMemory(*this); + } + }; + + CUresult reserveAdrressAndMapMemory(size_t size, size_t alignment, CUmemGenericAllocationHandle memory, CUdeviceptr* outPtr); friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); ~CCUDADevice() = default; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 838c527567..5341563ea0 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -131,6 +131,7 @@ class CCUDAHandler : public core::IReferenceCounted ,cuMemMap ,cuMemRelease ,cuMemSetAccess + ,cuMemImportFromShareableHandle ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index f2b449557c..24b9b79439 100644 --- 
a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -93,6 +93,31 @@ class IDeviceMemoryBacked : public IBackendObject const uint32_t* queueFamilyIndices = nullptr; }; + void chainPreDestroyCleanup(std::unique_ptr next) + { + if (!m_cachedCreationParams.preDestroyCleanup) + { + m_cachedCreationParams.preDestroyCleanup = std::move(next); + return; + } + + struct SChainedCleanup : ICleanup + { + std::unique_ptr first, next; + SChainedCleanup(std::unique_ptr&& first, std::unique_ptr&& next) + : first(std::move(first)) + , next(std::move(next)) + { } + ~SChainedCleanup() + { + first = nullptr; + next = nullptr; + } + }; + + m_cachedCreationParams.preDestroyCleanup = std::make_unique(std::move(m_cachedCreationParams.preDestroyCleanup), std::move(next)); + } + protected: inline IDeviceMemoryBacked(core::smart_refctd_ptr&& originDevice, SCreationParams&& creationParams, const SDeviceMemoryRequirements& reqs) : IBackendObject(std::move(originDevice)), m_cachedCreationParams(std::move(creationParams)), m_cachedMemoryReqs(reqs) {} diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 4d2e880095..565621b00a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -17,6 +17,151 @@ CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConn } +CUresult CCUDADevice::reserveAdrressAndMapMemory(size_t size, size_t alignment, CUmemGenericAllocationHandle memory, CUdeviceptr* outPtr) +{ + auto& cu = m_handler->getCUDAFunctionTable(); + + CUdeviceptr ptr = 0; + if (auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) + { + return err; + } + + if (auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) + { + cu.pcuMemAddressFree(ptr, size); + return err; + } + + CUmemAccessDesc accessDesc = { + .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = m_handle }, + .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, + }; + + if (auto err = 
cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) + { + cu.pcuMemUnmap(ptr, size); + cu.pcuMemAddressFree(ptr, size); + return err; + } + + *outPtr = ptr; + + return CUDA_SUCCESS; +} + +CUresult CCUDADevice::releaseExportableMemory(SSharedCUDAMemory mem) +{ + auto& cu = m_handler->getCUDAFunctionTable(); + if (auto err = cu.pcuMemUnmap(mem.ptr, mem.size); CUDA_SUCCESS != err) return err; + if (auto err = cu.pcuMemAddressFree(mem.ptr, mem.size); CUDA_SUCCESS != err) return err; + if (auto err = cu.pcuMemRelease(mem.memory); CUDA_SUCCESS != err) return err; + CloseHandle(mem.osHandle); +} + +CUresult CCUDADevice::createExportableMemory(size_t size, size_t alignment, SSharedCUDAMemory* outMem) +{ + if (!outMem) + return CUDA_ERROR_INVALID_VALUE; + + auto& cu = m_handler->getCUDAFunctionTable(); + + uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = CU_MEM_HANDLE_TYPE_WIN32, + .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = m_handle }, + .win32HandleMetaData = metaData, + }; + + size_t granularity = 0; + if (auto err = cu.pcuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); CUDA_SUCCESS != err) + return err; + + size = ((size - 1) / granularity + 1) * granularity; + + CUmemGenericAllocationHandle mem = 0; + void* handle = 0; + CUdeviceptr ptr = 0; + + if(auto err = cu.pcuMemCreate(&mem, size, &prop, 0); CUDA_SUCCESS != err) + return err; + + if (auto err = cu.pcuMemExportToShareableHandle(&handle, mem, CU_MEM_HANDLE_TYPE_WIN32, 0); CUDA_SUCCESS != err) + { + cu.pcuMemRelease(mem); + return err; + } + + if (auto err = reserveAdrressAndMapMemory(size, alignment, mem, &ptr); CUDA_SUCCESS != err) + { + CloseHandle(handle); + cu.pcuMemRelease(mem); + return err; + } + + outMem->size = size; + outMem->memory = mem; + outMem->ptr = ptr; + outMem->osHandle = handle; + return CUDA_SUCCESS; +} + +core::smart_refctd_ptr 
CCUDADevice::exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device) +{ + auto buf = device->createBuffer( + { {.size = mem.size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT }, + { {.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = mem.osHandle}} }); + + auto req = buf->getMemoryReqs(); + req.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto allocation = device->allocate(req, buf.get()); + + if (!(allocation.memory && allocation.offset != ILogicalDevice::InvalidMemoryOffset)) + return nullptr; + + buf->chainPreDestroyCleanup(std::make_unique(mem, core::smart_refctd_ptr(this))); + return buf; +} + +CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr) +{ + auto& params = buf->getCachedCreationParams(); + + if (!params.externalMemoryHandType.value || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, + .handle = {.win32 = {.handle = buf->getExternalHandle()}}, + .size = buf->getMemoryReqs().size, + }; + + CUmemGenericAllocationHandle mem = 0; + CUdeviceptr ptr = 0; + void* handle = handleDesc.handle.win32.handle; + + auto& cu = m_handler->getCUDAFunctionTable(); + if (auto err = cu.pcuMemImportFromShareableHandle(&mem, buf->getExternalHandle(), + static_cast(params.externalMemoryHandType.value)); + CUDA_SUCCESS != err) + return err; + + if(auto err = reserveAdrressAndMapMemory(buf->getSize(), 1 << buf->getMemoryReqs().alignmentLog2, mem, &ptr)) + { + cu.pcuMemRelease(mem); + return err; + } + + outPtr->ptr = ptr; + outPtr->memory = mem; + outPtr->size = buf->getSize(); + outPtr->osHandle = handle; + + buf->chainPreDestroyCleanup(std::make_unique(*outPtr, core::smart_refctd_ptr(this))); + return CUDA_SUCCESS; +} + #if 0 CUresult CCUDAHandler::registerBuffer(GraphicsAPIObjLink* 
link, uint32_t flags) { From bfa7afc5d01357f0f20cf904ab345100ba00631f Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 15 Jul 2023 13:38:16 +0300 Subject: [PATCH 05/62] correct chained cleanup desctruction order --- include/nbl/video/IDeviceMemoryBacked.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index 24b9b79439..4d44131e7c 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -93,11 +93,11 @@ class IDeviceMemoryBacked : public IBackendObject const uint32_t* queueFamilyIndices = nullptr; }; - void chainPreDestroyCleanup(std::unique_ptr next) + void chainPreDestroyCleanup(std::unique_ptr first) { if (!m_cachedCreationParams.preDestroyCleanup) { - m_cachedCreationParams.preDestroyCleanup = std::move(next); + m_cachedCreationParams.preDestroyCleanup = std::move(first); return; } @@ -115,7 +115,7 @@ class IDeviceMemoryBacked : public IBackendObject } }; - m_cachedCreationParams.preDestroyCleanup = std::make_unique(std::move(m_cachedCreationParams.preDestroyCleanup), std::move(next)); + m_cachedCreationParams.preDestroyCleanup = std::make_unique(std::move(first), std::move(m_cachedCreationParams.preDestroyCleanup)); } protected: From ddb861edf0c32c02e2a00e3d0628c9f6b4b1938f Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 15 Jul 2023 13:59:32 +0300 Subject: [PATCH 06/62] add safety checks --- include/nbl/video/CCUDADevice.h | 2 +- include/nbl/video/IDeviceMemoryBacked.h | 2 +- src/nbl/video/CCUDADevice.cpp | 23 ++++++++++++++++++++--- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index fd39f8ec57..183fb577be 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -205,7 +205,7 @@ class CCUDADevice : public core::IReferenceCounted , dev(std::move(dev)) { } - ~SCUDACleaner() + ~SCUDACleaner() 
override { dev->releaseExportableMemory(*this); } diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index 4d44131e7c..ef9bef6588 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -92,7 +92,7 @@ class IDeviceMemoryBacked : public IBackendObject { const uint32_t* queueFamilyIndices = nullptr; }; - + void chainPreDestroyCleanup(std::unique_ptr first) { if (!m_cachedCreationParams.preDestroyCleanup) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 565621b00a..cb6dc31728 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -109,9 +109,23 @@ CUresult CCUDADevice::createExportableMemory(size_t size, size_t alignment, SSha core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device) { + + if (!device || !mem.memory || !mem.osHandle || !mem.ptr || !mem.size) + return nullptr; + + { + CUuuid id; + // TODO(Atil): Cache properties + if (CUDA_SUCCESS != m_handler->getCUDAFunctionTable().pcuDeviceGetUuid(&id, m_handle)) + return nullptr; + + if (memcmp(&id, device->getPhysicalDevice()->getProperties().deviceUUID, 16)) + return nullptr; + } + auto buf = device->createBuffer( { {.size = mem.size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT }, - { {.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = mem.osHandle}} }); + { {.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = mem.osHandle}}}); auto req = buf->getMemoryReqs(); req.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); @@ -126,9 +140,12 @@ core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemor CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr) { + if (!buf || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + 
auto& params = buf->getCachedCreationParams(); - if (!params.externalMemoryHandType.value || !outPtr) + if (!params.externalMemoryHandType.value) return CUDA_ERROR_INVALID_VALUE; CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { @@ -147,7 +164,7 @@ CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr CUDA_SUCCESS != err) return err; - if(auto err = reserveAdrressAndMapMemory(buf->getSize(), 1 << buf->getMemoryReqs().alignmentLog2, mem, &ptr)) + if(auto err = reserveAdrressAndMapMemory(buf->getSize(), 1u << buf->getMemoryReqs().alignmentLog2, mem, &ptr)) { cu.pcuMemRelease(mem); return err; From f3803982fa6d4e102c9aa358fabccaf34c281c45 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 15 Jul 2023 23:20:29 +0300 Subject: [PATCH 07/62] semaphore interop --- include/nbl/video/CCUDADevice.h | 49 ++++++++--- include/nbl/video/CCUDAHandler.h | 4 + include/nbl/video/IDeviceMemoryBacked.h | 33 +++----- src/nbl/video/CCUDADevice.cpp | 106 +++++++++++++++++------- src/nbl/video/CVulkanBuffer.cpp | 2 +- 5 files changed, 130 insertions(+), 64 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 183fb577be..26005fcba3 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -182,36 +182,59 @@ class CCUDADevice : public core::IReferenceCounted CUdevice getInternalObject() const { return m_handle; } const CCUDAHandler* getHandler() const { return m_handler.get(); } - struct SSharedCUDAMemory + struct SSharedCUDAMemory : core::IReferenceCounted { + core::smart_refctd_ptr device; size_t size; CUdeviceptr ptr; CUmemGenericAllocationHandle memory; void* osHandle; + SSharedCUDAMemory(core::smart_refctd_ptr device, size_t size, CUdeviceptr ptr, CUmemGenericAllocationHandle memory, void* osHandle) + : device(std::move(device)) + , size(size) + , ptr(ptr) + , memory(memory) + , osHandle(osHandle) + {} + ~SSharedCUDAMemory() override; }; - core::smart_refctd_ptr 
exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device); + struct SExternalCUDASemaphore : core::IReferenceCounted + { + core::smart_refctd_ptr device; + CUexternalSemaphore semaphore; + void* osHandle; + SExternalCUDASemaphore(core::smart_refctd_ptr device, CUexternalSemaphore semaphore, void* osHandle) + : device(std::move(device)) + , semaphore(semaphore) + , osHandle(osHandle) + {} + ~SExternalCUDASemaphore() override; + }; - CUresult importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr); - CUresult createExportableMemory(size_t size, size_t alignment, SSharedCUDAMemory* outMem); - CUresult releaseExportableMemory(SSharedCUDAMemory mem); + core::smart_refctd_ptr exportGPUBuffer(SSharedCUDAMemory* mem, ILogicalDevice* device); + CUresult importGPUBuffer(core::smart_refctd_ptr* outPtr, IGPUBuffer* buf); + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, IGPUSemaphore* sem); + CUresult createExportableMemory(core::smart_refctd_ptr* outMem, size_t size, size_t alignment); + protected: + friend struct SSharedCUDAMemory; + CUresult releaseExportableMemory(SSharedCUDAMemory* mem); + CUresult destroyExternalSemaphore(SExternalCUDASemaphore* sema); - struct SCUDACleaner : video::ICleanup, SSharedCUDAMemory + struct SCUDACleaner : video::ICleanup { - core::smart_refctd_ptr dev; - SCUDACleaner(SSharedCUDAMemory mem, core::smart_refctd_ptr&& dev) - : SSharedCUDAMemory{ mem } - , dev(std::move(dev)) + core::smart_refctd_ptr resource; + SCUDACleaner(core::smart_refctd_ptr resource) + : resource(std::move(resource)) { } - ~SCUDACleaner() override { - dev->releaseExportableMemory(*this); + resource = nullptr; } }; - CUresult reserveAdrressAndMapMemory(size_t size, size_t alignment, CUmemGenericAllocationHandle memory, CUdeviceptr* outPtr); + CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemGenericAllocationHandle memory); friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, 
IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); ~CCUDADevice() = default; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 5341563ea0..b6f98385bb 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -132,6 +132,10 @@ class CCUDAHandler : public core::IReferenceCounted ,cuMemRelease ,cuMemSetAccess ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index ef9bef6588..d2ff049dfd 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -19,6 +19,15 @@ namespace nbl::video struct NBL_API2 ICleanup { virtual ~ICleanup() = 0; + + std::unique_ptr next; + + static void chain(std::unique_ptr& first, std::unique_ptr&& next) + { + if (first) + return chain(first->next, std::move(next)); + first = std::move(next); + } }; //! 
Interface from which resources backed by IDeviceMemoryAllocation inherit from @@ -95,29 +104,9 @@ class IDeviceMemoryBacked : public IBackendObject void chainPreDestroyCleanup(std::unique_ptr first) { - if (!m_cachedCreationParams.preDestroyCleanup) - { - m_cachedCreationParams.preDestroyCleanup = std::move(first); - return; - } - - struct SChainedCleanup : ICleanup - { - std::unique_ptr first, next; - SChainedCleanup(std::unique_ptr&& first, std::unique_ptr&& next) - : first(std::move(first)) - , next(std::move(next)) - { } - ~SChainedCleanup() - { - first = nullptr; - next = nullptr; - } - }; - - m_cachedCreationParams.preDestroyCleanup = std::make_unique(std::move(first), std::move(m_cachedCreationParams.preDestroyCleanup)); + ICleanup::chain(m_cachedCreationParams.preDestroyCleanup, std::move(first)); } - + protected: inline IDeviceMemoryBacked(core::smart_refctd_ptr&& originDevice, SCreationParams&& creationParams, const SDeviceMemoryRequirements& reqs) : IBackendObject(std::move(originDevice)), m_cachedCreationParams(std::move(creationParams)), m_cachedMemoryReqs(reqs) {} diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index cb6dc31728..c83fb562ba 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -17,7 +17,7 @@ CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConn } -CUresult CCUDADevice::reserveAdrressAndMapMemory(size_t size, size_t alignment, CUmemGenericAllocationHandle memory, CUdeviceptr* outPtr) +CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemGenericAllocationHandle memory) { auto& cu = m_handler->getCUDAFunctionTable(); @@ -50,16 +50,25 @@ CUresult CCUDADevice::reserveAdrressAndMapMemory(size_t size, size_t alignment, return CUDA_SUCCESS; } -CUresult CCUDADevice::releaseExportableMemory(SSharedCUDAMemory mem) +CUresult CCUDADevice::releaseExportableMemory(SSharedCUDAMemory* mem) { auto& cu = m_handler->getCUDAFunctionTable(); - 
if (auto err = cu.pcuMemUnmap(mem.ptr, mem.size); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemAddressFree(mem.ptr, mem.size); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemRelease(mem.memory); CUDA_SUCCESS != err) return err; - CloseHandle(mem.osHandle); + if (auto err = cu.pcuMemUnmap(mem->ptr, mem->size); CUDA_SUCCESS != err) return err; + if (auto err = cu.pcuMemAddressFree(mem->ptr, mem->size); CUDA_SUCCESS != err) return err; + if (auto err = cu.pcuMemRelease(mem->memory); CUDA_SUCCESS != err) return err; + CloseHandle(mem->osHandle); + return CUDA_SUCCESS; } -CUresult CCUDADevice::createExportableMemory(size_t size, size_t alignment, SSharedCUDAMemory* outMem) +CUresult CCUDADevice::destroyExternalSemaphore(SExternalCUDASemaphore* sema) +{ + auto& cu = m_handler->getCUDAFunctionTable(); + if (auto err = cu.pcuDestroyExternalSemaphore(sema->semaphore); CUDA_SUCCESS != err) return err; + CloseHandle(sema->osHandle); + return CUDA_SUCCESS; +} + +CUresult CCUDADevice::createExportableMemory(core::smart_refctd_ptr* outMem, size_t size, size_t alignment) { if (!outMem) return CUDA_ERROR_INVALID_VALUE; @@ -93,24 +102,32 @@ CUresult CCUDADevice::createExportableMemory(size_t size, size_t alignment, SSha return err; } - if (auto err = reserveAdrressAndMapMemory(size, alignment, mem, &ptr); CUDA_SUCCESS != err) + if (auto err = reserveAdrressAndMapMemory(&ptr, size, alignment, mem); CUDA_SUCCESS != err) { CloseHandle(handle); cu.pcuMemRelease(mem); return err; } - outMem->size = size; - outMem->memory = mem; - outMem->ptr = ptr; - outMem->osHandle = handle; + *outMem = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), size, ptr, mem, handle); + return CUDA_SUCCESS; } -core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemory mem, ILogicalDevice* device) +CCUDADevice::SSharedCUDAMemory::~SSharedCUDAMemory() { + device->releaseExportableMemory(this); +} + +CCUDADevice::SExternalCUDASemaphore::~SExternalCUDASemaphore() +{ + 
device->destroyExternalSemaphore(this); +} - if (!device || !mem.memory || !mem.osHandle || !mem.ptr || !mem.size) +core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemory* mem, ILogicalDevice* device) +{ + + if (!device || !mem || !mem->memory || !mem->osHandle || !mem->ptr || !mem->size) return nullptr; { @@ -123,9 +140,17 @@ core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemor return nullptr; } - auto buf = device->createBuffer( - { {.size = mem.size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT }, - { {.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = mem.osHandle}}}); + auto buf = device->createBuffer(IGPUBuffer::SCreationParams { + asset::IBuffer::SCreationParams{ + .size = mem->size, + .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT + }, + IDeviceMemoryBacked::SCreationParams{ + IDeviceMemoryBacked::SCachedCreationParams{ + .externalHandleType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, + .externalHandle = mem->osHandle + } + }}); auto req = buf->getMemoryReqs(); req.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); @@ -134,22 +159,22 @@ core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemor if (!(allocation.memory && allocation.offset != ILogicalDevice::InvalidMemoryOffset)) return nullptr; - buf->chainPreDestroyCleanup(std::make_unique(mem, core::smart_refctd_ptr(this))); + buf->chainPreDestroyCleanup(std::make_unique(core::smart_refctd_ptr(mem))); return buf; } -CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr) +CUresult CCUDADevice::importGPUBuffer(core::smart_refctd_ptr* outPtr, IGPUBuffer* buf) { if (!buf || !outPtr) return CUDA_ERROR_INVALID_VALUE; auto& params = buf->getCachedCreationParams(); - if (!params.externalMemoryHandType.value) + if 
(!params.externalHandleType.value) return CUDA_ERROR_INVALID_VALUE; CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { - .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, + .type = static_cast(params.externalHandleType.value), .handle = {.win32 = {.handle = buf->getExternalHandle()}}, .size = buf->getMemoryReqs().size, }; @@ -160,22 +185,47 @@ CUresult CCUDADevice::importGPUBuffer(IGPUBuffer* buf, SSharedCUDAMemory* outPtr auto& cu = m_handler->getCUDAFunctionTable(); if (auto err = cu.pcuMemImportFromShareableHandle(&mem, buf->getExternalHandle(), - static_cast(params.externalMemoryHandType.value)); + static_cast(params.externalHandleType.value)); CUDA_SUCCESS != err) return err; - if(auto err = reserveAdrressAndMapMemory(buf->getSize(), 1u << buf->getMemoryReqs().alignmentLog2, mem, &ptr)) + if(auto err = reserveAdrressAndMapMemory(&ptr, buf->getSize(), 1u << buf->getMemoryReqs().alignmentLog2, mem)) { cu.pcuMemRelease(mem); return err; } - outPtr->ptr = ptr; - outPtr->memory = mem; - outPtr->size = buf->getSize(); - outPtr->osHandle = handle; + *outPtr = core::make_smart_refctd_ptr( + core::smart_refctd_ptr(this), + buf->getSize(), ptr, mem, handle); + + buf->chainPreDestroyCleanup(std::make_unique(*outPtr)); + return CUDA_SUCCESS; +} + +CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, IGPUSemaphore* sema) +{ + if (!sema || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + + auto& cu = m_handler->getCUDAFunctionTable(); + auto handleType = sema->getCreationParams().externalHandleType.value; + auto handle = sema->getCreationParams().externalHandle; - buf->chainPreDestroyCleanup(std::make_unique(*outPtr, core::smart_refctd_ptr(this))); + if (!handleType || !handle) + return CUDA_ERROR_INVALID_VALUE; + + CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { + .type = static_cast(handleType), + .handle = {.win32 = {.handle = handle }}, + }; + + CUexternalSemaphore cusema; + if (auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) 
+ return err; + + *outPtr = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), cusema, handle); + sema->chainPreDestroyCleanup(std::make_unique(*outPtr)); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CVulkanBuffer.cpp b/src/nbl/video/CVulkanBuffer.cpp index c24e13ceb8..2e542944d3 100644 --- a/src/nbl/video/CVulkanBuffer.cpp +++ b/src/nbl/video/CVulkanBuffer.cpp @@ -8,7 +8,7 @@ namespace nbl::video CVulkanBuffer::~CVulkanBuffer() { preDestroyStep(); - if (m_cachedCreationParams.skipHandleDestroy) + if (!m_cachedCreationParams.skipHandleDestroy) { const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); auto* vk = vulkanDevice->getFunctionTable(); From 2f7b517dd7e59b070e0609127448e4fa4565a2a5 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 15 Jul 2023 23:21:31 +0300 Subject: [PATCH 08/62] get cuda interop working in vulkan_1_3 branch --- examples_tests | 2 +- include/nbl/video/CCUDADevice.h | 189 ++--------- include/nbl/video/CCUDAHandler.h | 14 +- include/nbl/video/CCUDASharedMemory.h | 74 +++++ include/nbl/video/CCUDASharedSemaphore.h | 49 +++ include/nbl/video/IDeviceMemoryAllocation.h | 95 ++++-- include/nbl/video/IDeviceMemoryAllocator.h | 51 ++- include/nbl/video/IDeviceMemoryBacked.h | 3 + include/nbl/video/ILogicalDevice.h | 4 +- include/nbl/video/ISemaphore.h | 41 ++- include/nbl/video/SPhysicalDeviceLimits.h | 3 + src/nbl/CMakeLists.txt | 7 +- .../asset/interchange/CPLYMeshFileLoader.cpp | 7 +- .../asset/interchange/CSTLMeshFileLoader.cpp | 8 +- src/nbl/video/CCUDADevice.cpp | 306 ++++-------------- src/nbl/video/CCUDAHandler.cpp | 15 +- src/nbl/video/CCUDASharedMemory.cpp | 109 +++++++ src/nbl/video/CCUDASharedSemaphore.cpp | 18 ++ src/nbl/video/CVulkanCommandBuffer.cpp | 2 +- src/nbl/video/CVulkanLogicalDevice.cpp | 70 +++- src/nbl/video/CVulkanLogicalDevice.h | 2 +- src/nbl/video/CVulkanMemoryAllocation.cpp | 14 +- src/nbl/video/CVulkanMemoryAllocation.h | 7 +- src/nbl/video/CVulkanPhysicalDevice.cpp | 3 + 
src/nbl/video/CVulkanQueue.cpp | 2 +- src/nbl/video/CVulkanSemaphore.h | 7 +- src/nbl/video/IGPUCommandBuffer.cpp | 4 +- 27 files changed, 623 insertions(+), 483 deletions(-) create mode 100644 include/nbl/video/CCUDASharedMemory.h create mode 100644 include/nbl/video/CCUDASharedSemaphore.h create mode 100644 src/nbl/video/CCUDASharedMemory.cpp create mode 100644 src/nbl/video/CCUDASharedSemaphore.cpp diff --git a/examples_tests b/examples_tests index faddda46b2..6ce21d5c5c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit faddda46b285b433c2926b384064bd80a6889b43 +Subproject commit 6ce21d5c5c8026b6772f3e60e21096ee54353a81 diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 26005fcba3..7b2b952548 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -6,7 +6,8 @@ #include "nbl/video/IPhysicalDevice.h" - +#include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDASharedSemaphore.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -23,10 +24,20 @@ namespace nbl::video { class CCUDAHandler; +class CCUDASharedMemory; +class CCUDASharedSemaphore; class CCUDADevice : public core::IReferenceCounted { public: +#ifdef _WIN32 + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; + static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; +#else + static constexpr IDeviceMemoryBacked::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryBacked::EHT_OPAQUE_FD; + static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif + enum E_VIRTUAL_ARCHITECTURE { EVA_30, @@ -72,181 +83,45 @@ class CCUDADevice : public core::IReferenceCounted // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vulkan-interoperability // Watch out, use Driver API (`cu` functions) NOT the Runtime API (`cuda` functions) // 
Also maybe separate this out into its own `CCUDA` class instead of nesting it here? -#if 0 - template - struct GraphicsAPIObjLink - { - GraphicsAPIObjLink() : obj(nullptr), cudaHandle(nullptr), acquired(false) - { - asImage = {nullptr}; - } - GraphicsAPIObjLink(core::smart_refctd_ptr&& _obj) : GraphicsAPIObjLink() - { - obj = std::move(_obj); - } - GraphicsAPIObjLink(GraphicsAPIObjLink&& other) : GraphicsAPIObjLink() - { - operator=(std::move(other)); - } - - GraphicsAPIObjLink(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(GraphicsAPIObjLink&& other) - { - std::swap(obj,other.obj); - std::swap(cudaHandle,other.cudaHandle); - std::swap(acquired,other.acquired); - std::swap(asImage,other.asImage); - return *this; - } - - ~GraphicsAPIObjLink() - { - assert(!acquired); // you've fucked up, there's no way for us to fix it, you need to release the objects on a proper stream - if (obj) - CCUDAHandler::cuda.pcuGraphicsUnregisterResource(cudaHandle); - } - - // - auto* getObject() const {return obj.get();} - - private: - core::smart_refctd_ptr obj; - CUgraphicsResource cudaHandle; - bool acquired; - - friend class CCUDAHandler; - public: - union - { - struct - { - CUdeviceptr pointer; - } asBuffer; - struct - { - CUmipmappedArray mipmappedArray; - CUarray array; - } asImage; - }; - }; - - // - static CUresult registerBuffer(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - static CUresult registerImage(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - - template - static CUresult acquireResourcesFromGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (iit->acquired) - 
return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsMapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = true; - return retval; - } - template - static CUresult releaseResourcesToGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsUnmapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = false; - return retval; - } - - static CUresult acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes = nullptr); - static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); - static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); -#endif CUdevice getInternalObject() const { return m_handle; } const CCUDAHandler* getHandler() const { return m_handler.get(); } - - struct SSharedCUDAMemory : core::IReferenceCounted - { - core::smart_refctd_ptr device; - size_t size; - CUdeviceptr ptr; - CUmemGenericAllocationHandle memory; - void* osHandle; - SSharedCUDAMemory(core::smart_refctd_ptr device, size_t size, CUdeviceptr ptr, CUmemGenericAllocationHandle memory, void* osHandle) - : device(std::move(device)) - , size(size) - , ptr(ptr) - , memory(memory) - , osHandle(osHandle) - {} - ~SSharedCUDAMemory() override; - }; - - struct SExternalCUDASemaphore : core::IReferenceCounted - { - core::smart_refctd_ptr device; - CUexternalSemaphore semaphore; - void* 
osHandle; - SExternalCUDASemaphore(core::smart_refctd_ptr device, CUexternalSemaphore semaphore, void* osHandle) - : device(std::move(device)) - , semaphore(semaphore) - , osHandle(osHandle) - {} - ~SExternalCUDASemaphore() override; - }; - - core::smart_refctd_ptr exportGPUBuffer(SSharedCUDAMemory* mem, ILogicalDevice* device); - CUresult importGPUBuffer(core::smart_refctd_ptr* outPtr, IGPUBuffer* buf); - CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, IGPUSemaphore* sem); - CUresult createExportableMemory(core::smart_refctd_ptr* outMem, size_t size, size_t alignment); + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + CUresult createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); + bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } + size_t roundToGranularity(CUmemLocationType location, size_t size) const; + protected: - friend struct SSharedCUDAMemory; - CUresult releaseExportableMemory(SSharedCUDAMemory* mem); - CUresult destroyExternalSemaphore(SExternalCUDASemaphore* sema); + CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + + friend class CCUDAHandler; + friend class CCUDASharedMemory; + friend class CCUDASharedSemaphore; struct SCUDACleaner : video::ICleanup { - core::smart_refctd_ptr resource; - SCUDACleaner(core::smart_refctd_ptr resource) + core::smart_refctd_ptr resource; + SCUDACleaner(core::smart_refctd_ptr resource) : resource(std::move(resource)) { } - ~SCUDACleaner() override - { - resource = nullptr; - } }; - - CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemGenericAllocationHandle memory); - friend class CCUDAHandler; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* 
const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); - ~CCUDADevice() = default; + + CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _handle, core::smart_refctd_ptr&& _handler); + ~CCUDADevice(); std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; IPhysicalDevice* const m_vulkanDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + core::smart_refctd_ptr m_handler; + CUdevice m_handle; + CUcontext m_context; + size_t m_allocationGranularity[4]; }; } #endif // _NBL_COMPILE_WITH_CUDA_ -#endif +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index b6f98385bb..dbad47877d 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -34,7 +34,7 @@ class CCUDAHandler : public core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } // - core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; @@ -174,9 +174,9 @@ class CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::future bytesRead; + system::IFile::success_t bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.get()); + source.resize(bytesRead.getBytesProcessed()); return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } @@ -243,8 +243,7 @@ class CCUDAHandler : public core::IReferenceCounted } core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); - - protected: +protected: CCUDAHandler(CUDA&& _cuda, NVRTC&& 
_nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) { @@ -256,7 +255,8 @@ class CCUDAHandler : public core::IReferenceCounted } } ~CCUDAHandler() = default; - + + // inline ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) { @@ -289,4 +289,4 @@ class CCUDAHandler : public core::IReferenceCounted #endif // _NBL_COMPILE_WITH_CUDA_ -#endif +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h new file mode 100644 index 0000000000..9b3e4a0551 --- /dev/null +++ b/include/nbl/video/CCUDASharedMemory.h @@ -0,0 +1,74 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ + + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class CCUDAMemoryMapping: public core::IReferenceCounted +{ +}; + +class CCUDASharedMemory : public core::IReferenceCounted +{ +public: + friend class CCUDADevice; + + CUdeviceptr getDeviceptr() const { return m_params.ptr; } + + struct SCreationParams + { + size_t size; + uint32_t alignment; + CUmemLocationType location; + }; + + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + CUdeviceptr ptr; + union + { + void* osHandle; + int fd; + }; + }; + + const SCreationParams& getCreationParams() const { return m_params; } + + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + core::smart_refctd_ptr exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; + +protected: + + CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) + : m_device(std::move(device)) + , m_params(std::move(params)) + {} + ~CCUDASharedMemory() override; + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h new file mode 100644 index 0000000000..882e794bd4 --- /dev/null +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class CCUDASharedSemaphore : public core::IReferenceCounted +{ +public: + friend class CCUDADevice; + + CUexternalSemaphore getInternalObject() const { return m_handle; } + +protected: + + CCUDASharedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, CUexternalSemaphore semaphore, void* osHandle) + : m_device(std::move(device)) + , m_src(std::move(m_src)) + , m_handle(semaphore) + , m_osHandle(osHandle) + {} + ~CCUDASharedSemaphore() override; + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalSemaphore m_handle; + void* m_osHandle; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif \ No newline at end of file diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 7365fa6339..7074f8861b 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -24,6 +24,8 @@ We only support persistently mapped buffers with ARB_buffer_storage. Please don't ask us to support Buffer Orphaning. */ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted { + friend class IDeviceMemoryAllocator; + friend class ILogicalDevice; public: //! Access flags for how the application plans to use mapped memory (if any) /** When you create the memory you can allow for it to be mapped (be given a pointer) @@ -68,6 +70,43 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted EMHF_MULTI_INSTANCE_BIT = 0x00000002, }; + //! 
Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D11_TEXTURE = 0x00000008, + EHT_D3D11_TEXTURE_KMT = 0x00000010, + EHT_D3D12_HEAP = 0x00000020, + EHT_D3D12_RESOURCE = 0x00000040, + EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, + }; + + /* ExternalMemoryProperties *//* provided by VK_KHR_external_memory_capabilities */ + struct SExternalMemoryProperties + { + uint32_t exportableTypes : 7 = ~0u; + uint32_t compatibleTypes : 7 = ~0u; + uint32_t dedicatedOnly : 1 = 0u; + uint32_t exportable : 1 = ~0u; + uint32_t importable : 1 = ~0u; + + bool operator == (SExternalMemoryProperties const& rhs) const = default; + + SExternalMemoryProperties operator &(SExternalMemoryProperties rhs) const + { + rhs.exportableTypes &= exportableTypes; + rhs.compatibleTypes &= compatibleTypes; + rhs.dedicatedOnly |= dedicatedOnly; + rhs.exportable &= exportable; + rhs.importable &= importable; + return rhs; + } + }; + + static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); + // const ILogicalDevice* getOriginDevice() const {return m_originDevice;} @@ -75,25 +114,25 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted E_API_TYPE getAPIType() const; //! Whether the allocation was made for a specific resource and is supposed to only be bound to that resource. - inline bool isDedicated() const {return m_dedicated;} + inline bool isDedicated() const {return m_params.dedicated;} //! Returns the size of the memory allocation - inline size_t getAllocationSize() const {return m_allocationSize;} + inline size_t getAllocationSize() const {return m_params.allocationSize;} //! - inline core::bitflag getAllocateFlags() const { return m_allocateFlags; } + inline core::bitflag getAllocateFlags() const { return m_params.allocateFlags; } //! 
- inline core::bitflag getMemoryPropertyFlags() const { return m_memoryPropertyFlags; } + inline core::bitflag getMemoryPropertyFlags() const { return m_params.memoryPropertyFlags; } //! Utility function, tells whether the allocation can be mapped (whether mapMemory will ever return anything other than nullptr) - inline bool isMappable() const {return m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)||m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} + inline bool isMappable() const {return m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)|| m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} //! Utility function, tell us if writes by the CPU or GPU need extra visibility operations to become visible for reading on the other processor /** Only execute flushes or invalidations if the allocation requires them, and batch them (flush one combined range instead of two or more) for greater efficiency. To execute a flush or invalidation, use IDriver::flushMappedAllocationRanges and IDriver::invalidateMappedAllocationRanges respectively. */ inline bool haveToMakeVisible() const { - return !m_memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); + return !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); } //! 
@@ -106,9 +145,9 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted { if (isCurrentlyMapped()) return false; - if(accessHint.hasFlags(EMCAF_READ) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) + if(accessHint.hasFlags(EMCAF_READ) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) return false; - if(accessHint.hasFlags(EMCAF_WRITE) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) + if(accessHint.hasFlags(EMCAF_WRITE) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) return false; m_mappedPtr = reinterpret_cast(map_impl(range,accessHint)); if (m_mappedPtr) @@ -149,23 +188,41 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Constant variant of getMappedPointer inline const void* getMappedPointer() const { return m_mappedPtr; } + struct SCreationParams + { + core::bitflag allocateFlags = E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE; + core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; + E_EXTERNAL_HANDLE_TYPE externalHandleType = E_EXTERNAL_HANDLE_TYPE::EHT_NONE; + void* externalHandle = nullptr; + const bool dedicated = false; + const size_t allocationSize; + }; + protected: - inline IDeviceMemoryAllocation( - const ILogicalDevice* const originDevice, const size_t _size, const core::bitflag allocateFlags, const core::bitflag memoryPropertyFlags, const bool dedicated - ) : m_originDevice(originDevice), m_allocationSize(_size), m_allocateFlags(allocateFlags), m_memoryPropertyFlags(memoryPropertyFlags), m_dedicated(dedicated) {} + inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) + { + m_postDestroyCleanup = std::move(cleanup); + } + + IDeviceMemoryAllocation( + const ILogicalDevice* originDevice, SCreationParams&& params = {}) + : m_originDevice(originDevice) + , m_mappedPtr(nullptr) + , m_mappedRange{ 0, 0 } + , m_currentMappingAccess(EMCAF_NO_MAPPING_ACCESS) + , m_params(std::move(params)) + {} virtual void* map_impl(const MemoryRange& 
range, const core::bitflag accessHint) = 0; virtual bool unmap_impl() = 0; - const ILogicalDevice* const m_originDevice; - const size_t m_allocationSize; - uint8_t* m_mappedPtr = nullptr; - MemoryRange m_mappedRange = {}; - core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; - const core::bitflag m_allocateFlags; - const core::bitflag m_memoryPropertyFlags; - const bool m_dedicated; + const ILogicalDevice* m_originDevice = nullptr; + uint8_t* m_mappedPtr; + MemoryRange m_mappedRange; + core::bitflag m_currentMappingAccess; + SCreationParams m_params; + std::unique_ptr m_postDestroyCleanup = nullptr; }; NBL_ENUM_ADD_BITWISE_OPERATORS(IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS) diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 0712ec24f6..408efd6da4 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -19,6 +19,12 @@ class IDeviceMemoryAllocator size_t memoryTypeIndex : 5 = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. + + // Handle Type for external resources + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE + //! Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE + void* externalHandle = nullptr; }; //! 
IMemoryTypeIterator extracts memoryType indices from memoryTypeBits in arbitrary order @@ -27,8 +33,15 @@ class IDeviceMemoryAllocator class IMemoryTypeIterator { public: - IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) - : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs) {} + IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + void* handle) + : m_allocateFlags(static_cast(allocateFlags.value)) + , m_reqs(reqs) + , m_handleType(handleType) + , m_handle(handle) + {} static inline uint32_t end() {return 32u;} @@ -40,11 +53,13 @@ class IDeviceMemoryAllocator inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) { - SAllocateInfo ret; + SAllocateInfo ret = {}; ret.size = m_reqs.size; ret.flags = m_allocateFlags; ret.memoryTypeIndex = dereference(); ret.dedication = dedication; + ret.externalHandleType = m_handleType; + ret.externalHandle = m_handle; return ret; } @@ -57,17 +72,24 @@ class IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t m_allocateFlags; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; + void* m_handle; }; //! 
DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB class DefaultMemoryTypeIterator : public IMemoryTypeIterator { public: - DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) : IMemoryTypeIterator(reqs, allocateFlags) + DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + void* handle) + : IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } + protected: uint32_t dereference() const override { @@ -100,19 +122,26 @@ class IDeviceMemoryAllocator }; virtual SAllocation allocate(const SAllocateInfo& info) = 0; - template - inline SAllocation allocate( - const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, IDeviceMemoryBacked* dedication=nullptr, - const core::bitflag allocateFlags=IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE) + template + SAllocation allocate( + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + IDeviceMemoryBacked* dedication = nullptr, + const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType = IDeviceMemoryAllocation::EHT_NONE, + void* handle = nullptr, + std::unique_ptr&& postDestroyCleanup = nullptr) { - for(memory_type_iterator_t memTypeIt(reqs, allocateFlags); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) + for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, handleType, handle); memTypeIt != IMemoryTypeIterator::end(); ++memTypeIt) { SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); - auto allocation = allocate(allocateInfo); + SAllocation allocation = allocate(allocateInfo); if (allocation.isValid()) + { + allocation.memory->setPostDestroyCleanup(std::move(postDestroyCleanup)); return allocation; 
+ } } - return {}; + return { }; } }; diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index d2ff049dfd..278e681a35 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -46,6 +46,8 @@ class IDeviceMemoryBacked : public IBackendObject // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects bool skipHandleDestroy = false; + core::bitflag externalHandleTypes = IDeviceMemoryAllocation::EHT_NONE; + //! If you specify queue family indices, then you're concurrent sharing inline bool isConcurrentSharing() const { @@ -125,6 +127,7 @@ class IDeviceMemoryBacked : public IBackendObject //! members SCachedCreationParams m_cachedCreationParams; SDeviceMemoryRequirements m_cachedMemoryReqs; + void* m_cachedExternalHandle = nullptr; }; } // end namespace nbl::video diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 278390939d..d4cdc6fd99 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -85,7 +85,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe bool supportsMask(const uint32_t queueFamilyIndex, core::bitflag accessMask) const; //! NOTE/TODO: this is not yet finished - inline bool validateMemoryBarrier(const uint32_t queueFamilyIndex, asset::SMemoryBarrier barrier) const; + bool validateMemoryBarrier(const uint32_t queueFamilyIndex, asset::SMemoryBarrier barrier) const; inline bool validateMemoryBarrier(const uint32_t queueFamilyIndex, const IGPUCommandBuffer::SOwnershipTransferBarrier& barrier, const bool concurrentSharing) const { // implicitly satisfied by our API: @@ -147,7 +147,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual IQueue::RESULT waitIdle() const = 0; //! 
Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) = 0; + virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&& ) = 0; // struct SSemaphoreWaitInfo { diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index dae0efe1bf..768fe1a66d 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -26,9 +26,48 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; + //! Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0x00000000, + EHT_OPAQUE_FD = 0x00000001, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D12_FENCE = 0x00000008, + EHT_SYNC_FD = 0x00000010, + }; + + //! + struct SCreationParams + { + // A Pre-Destroy-Step is called out just before a `vkDestory` or `glDelete`, this is only useful for "imported" resources + std::unique_ptr preDestroyCleanup = nullptr; + // A Post-Destroy-Step is called in this class' destructor, this is only useful for "imported" resources + std::unique_ptr postDestroyCleanup = nullptr; + // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects + bool skipHandleDestroy = false; + // Handle Type for external resources + core::bitflag externalHandleTypes = EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE + //! 
Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE + void* externalHandle = nullptr; + + uint64_t initialValue = 0; + }; + + auto const& getCreationParams() const + { + return m_creationParams; + } + protected: - inline ISemaphore(core::smart_refctd_ptr&& dev) : IBackendObject(std::move(dev)) {} + ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& params = {}) + : IBackendObject(std::move(dev)) + , m_creationParams(std::move(params)) + {} virtual ~ISemaphore() = default; + + SCreationParams m_creationParams; }; } diff --git a/include/nbl/video/SPhysicalDeviceLimits.h b/include/nbl/video/SPhysicalDeviceLimits.h index fe263aed84..7f58a67443 100644 --- a/include/nbl/video/SPhysicalDeviceLimits.h +++ b/include/nbl/video/SPhysicalDeviceLimits.h @@ -552,6 +552,9 @@ struct SPhysicalDeviceLimits /* CooperativeMatrixPropertiesKHR *//* VK_KHR_cooperative_matrix */ core::bitflag cooperativeMatrixSupportedStages = asset::IShader::ESS_UNKNOWN; + bool externalFenceWin32 = false; /* VK_KHR_external_fence_win32 */ // [TODO] requires instance extensions, add them + bool externalMemoryWin32 = false; /* VK_KHR_external_memory_win32 */ // [TODO] requires instance extensions, add them + bool externalSemaphoreWin32 = false; /* VK_KHR_external_semaphore_win32 */ // [TODO] requires instance extensions, add them /* Always enabled if available, reported as limits */ diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index ac0aa0c42d..dc9f4e7bef 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -119,7 +119,6 @@ nbl_get_conf_dir(NABLA_CONF_DIR_RELEASE Release) nbl_get_conf_dir(NABLA_CONF_DIR_RELWITHDEBINFO RelWithDebInfo) if (NBL_COMPILE_WITH_CUDA) - message(STATUS "Building with CUDA interop") set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) if (NBL_BUILD_OPTIX) set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) @@ -320,6 +319,8 @@ set(NBL_VIDEO_SOURCES # CUDA ${NBL_ROOT_PATH}/src/nbl/video/CCUDAHandler.cpp 
${NBL_ROOT_PATH}/src/nbl/video/CCUDADevice.cpp + ${NBL_ROOT_PATH}/src/nbl/video/CCUDASharedMemory.cpp + ${NBL_ROOT_PATH}/src/nbl/video/CCUDASharedSemaphore.cpp ) set(NBL_SCENE_SOURCES @@ -384,6 +385,10 @@ endif() target_compile_definitions(Nabla PRIVATE __NBL_BUILDING_NABLA__) +if (NBL_COMPILE_WITH_CUDA) + target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) +endif() + if (ANDROID) add_library(android_native_app_glue STATIC ${ANDROID_NDK_ROOT_PATH}/sources/android/native_app_glue/android_native_app_glue.c diff --git a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp index 4ad0710dbf..6eb93d7242 100644 --- a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp @@ -96,6 +96,7 @@ void CPLYMeshFileLoader::initialize() auto pipelineBundle = defaultOverride.findCachedAsset(pipelineCacheHash, types, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); if (pipelineBundle.getContents().empty()) { +#if 0 // WHAT IS THIS? 
auto mbVertexShader = core::smart_refctd_ptr(); auto mbFragmentShader = core::smart_refctd_ptr(); { @@ -108,6 +109,7 @@ void CPLYMeshFileLoader::initialize() mbVertexShader = core::smart_refctd_ptr_static_cast(vertexShaderBundle->begin()->getContents().begin()[0]); mbFragmentShader = core::smart_refctd_ptr_static_cast(fragmentShaderBundle->begin()->getContents().begin()[0]); } +#endif auto mbPipelineLayout = defaultOverride.findDefaultAsset("nbl/builtin/pipeline_layout/loader/PLY", fakeContext, 0u).first; @@ -130,7 +132,7 @@ void CPLYMeshFileLoader::initialize() const auto currentBitmask = core::createBitmask({ attrib }); inputParams.enabledBindingFlags |= currentBitmask; inputParams.enabledAttribFlags |= currentBitmask; - inputParams.bindings[attrib] = { asset::getTexelOrBlockBytesize(static_cast(vertexAttribParamsAllOptions[attrib].format)), EVIR_PER_VERTEX }; + inputParams.bindings[attrib] = { asset::getTexelOrBlockBytesize(static_cast(vertexAttribParamsAllOptions[attrib].format)), SVertexInputBindingParams::EVIR_PER_VERTEX}; inputParams.attributes[attrib] = vertexAttribParamsAllOptions[attrib]; } @@ -143,14 +145,15 @@ void CPLYMeshFileLoader::initialize() SRasterizationParams rastarizationParmas; +#if 0 // WHAT IS THIS? 
auto mbPipeline = core::make_smart_refctd_ptr(std::move(mbPipelineLayout), nullptr, nullptr, inputParams, blendParams, primitiveAssemblyParams, rastarizationParmas); { mbPipeline->setShaderAtStage(asset::IShader::ESS_VERTEX, mbVertexShader.get()); mbPipeline->setShaderAtStage(asset::IShader::ESS_FRAGMENT, mbFragmentShader.get()); - asset::SAssetBundle newPipelineBundle(nullptr, { core::smart_refctd_ptr(mbPipeline) }); defaultOverride.insertAssetIntoCache(newPipelineBundle, pipelineCacheHash, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); } +#endif } else return; diff --git a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp index c080857c63..b507153916 100644 --- a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp @@ -52,6 +52,7 @@ void CSTLMeshFileLoader::initialize() auto pipelineBundle = defaultOverride.findCachedAsset(pipelineCacheHash, types, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); if (pipelineBundle.getContents().empty()) { +#if 0 // WHAT IS THIS? 
auto mbVertexShader = core::smart_refctd_ptr(); auto mbFragmentShader = core::smart_refctd_ptr(); { @@ -64,7 +65,7 @@ void CSTLMeshFileLoader::initialize() mbVertexShader = core::smart_refctd_ptr_static_cast(vertexShaderBundle->begin()->getContents().begin()[0]); mbFragmentShader = core::smart_refctd_ptr_static_cast(fragmentShaderBundle->begin()->getContents().begin()[0]); } - +#endif auto defaultOverride = IAssetLoaderOverride(m_assetMgr); const IAssetLoader::SAssetLoadContext fakeContext(IAssetLoader::SAssetLoadParams{}, nullptr); @@ -79,7 +80,7 @@ void CSTLMeshFileLoader::initialize() const auto stride = positionFormatByteSize + colorFormatByteSize + normalFormatByteSize; mbInputParams.enabledBindingFlags |= core::createBitmask({ 0 }); mbInputParams.enabledAttribFlags |= core::createBitmask({ POSITION_ATTRIBUTE, NORMAL_ATTRIBUTE, withColorAttribute ? COLOR_ATTRIBUTE : 0 }); - mbInputParams.bindings[0] = { stride, EVIR_PER_VERTEX }; + mbInputParams.bindings[0] = { stride, SVertexInputBindingParams::EVIR_PER_VERTEX }; mbInputParams.attributes[POSITION_ATTRIBUTE].format = EF_R32G32B32_SFLOAT; mbInputParams.attributes[POSITION_ATTRIBUTE].relativeOffset = 0; @@ -102,14 +103,15 @@ void CSTLMeshFileLoader::initialize() SRasterizationParams rastarizationParmas; +#if 0 // WHAT IS THIS? 
auto mbPipeline = core::make_smart_refctd_ptr(std::move(mbPipelineLayout), nullptr, nullptr, mbInputParams, blendParams, primitiveAssemblyParams, rastarizationParmas); { mbPipeline->setShaderAtStage(asset::IShader::ESS_VERTEX, mbVertexShader.get()); mbPipeline->setShaderAtStage(asset::IShader::ESS_FRAGMENT, mbFragmentShader.get()); } - asset::SAssetBundle newPipelineBundle(nullptr, {core::smart_refctd_ptr(mbPipeline)}); defaultOverride.insertAssetIntoCache(newPipelineBundle, pipelineCacheHash, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW); +#endif } else return; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index c83fb562ba..39faaaa0ed 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -1,40 +1,68 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CCUDAHandler.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { -CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture) - : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture) +CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _handle, core::smart_refctd_ptr&& _handler) + : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture), m_handle(_handle), m_handler(std::move(_handler)), m_allocationGranularity{} { m_defaultCompileOptions.push_back("--std=c++14"); 
m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); + auto& cu = m_handler->getCUDAFunctionTable(); + + CUresult re = cu.pcuCtxCreate_v2(&m_context, 0, m_handle); + assert(CUDA_SUCCESS == re); + re = cu.pcuCtxSetCurrent(m_context); + assert(CUDA_SUCCESS == re); + + for (uint32_t i = 0; i < ARRAYSIZE(m_allocationGranularity); ++i) + { + uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = {.type = static_cast(i), .id = m_handle }, + .win32HandleMetaData = metaData, + }; + auto re = cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[i], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + + assert(CUDA_SUCCESS == re); + } + +} + +CCUDADevice::~CCUDADevice() +{ + m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); } +size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const +{ + return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; +} -CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemGenericAllocationHandle memory) +CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { auto& cu = m_handler->getCUDAFunctionTable(); CUdeviceptr ptr = 0; if (auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) - { return err; - } if (auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { cu.pcuMemAddressFree(ptr, size); return err; } - + CUmemAccessDesc accessDesc = { - .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = m_handle }, + .location = { .type = location, .id = m_handle }, .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, }; @@ -50,173 +78,70 
@@ CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } -CUresult CCUDADevice::releaseExportableMemory(SSharedCUDAMemory* mem) -{ - auto& cu = m_handler->getCUDAFunctionTable(); - if (auto err = cu.pcuMemUnmap(mem->ptr, mem->size); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemAddressFree(mem->ptr, mem->size); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemRelease(mem->memory); CUDA_SUCCESS != err) return err; - CloseHandle(mem->osHandle); - return CUDA_SUCCESS; -} - -CUresult CCUDADevice::destroyExternalSemaphore(SExternalCUDASemaphore* sema) -{ - auto& cu = m_handler->getCUDAFunctionTable(); - if (auto err = cu.pcuDestroyExternalSemaphore(sema->semaphore); CUDA_SUCCESS != err) return err; - CloseHandle(sema->osHandle); - return CUDA_SUCCESS; -} - -CUresult CCUDADevice::createExportableMemory(core::smart_refctd_ptr* outMem, size_t size, size_t alignment) +CUresult CCUDADevice::createSharedMemory( + core::smart_refctd_ptr* outMem, + CCUDASharedMemory::SCreationParams&& inParams) { if (!outMem) return CUDA_ERROR_INVALID_VALUE; + CCUDASharedMemory::SCachedCreationParams params = { inParams }; + auto& cu = m_handler->getCUDAFunctionTable(); uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = CU_MEM_HANDLE_TYPE_WIN32, - .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = m_handle }, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = params.location, .id = m_handle }, .win32HandleMetaData = metaData, }; - - size_t granularity = 0; - if (auto err = cu.pcuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); CUDA_SUCCESS != err) - return err; - - size = ((size - 1) / granularity + 1) * granularity; - CUmemGenericAllocationHandle mem = 0; - void* handle = 0; - CUdeviceptr ptr = 0; + params.granularSize = roundToGranularity(params.location, params.size); - if(auto err 
= cu.pcuMemCreate(&mem, size, &prop, 0); CUDA_SUCCESS != err) + CUmemGenericAllocationHandle mem; + if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) return err; - - if (auto err = cu.pcuMemExportToShareableHandle(&handle, mem, CU_MEM_HANDLE_TYPE_WIN32, 0); CUDA_SUCCESS != err) + + if (auto err = cu.pcuMemExportToShareableHandle(¶ms.osHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { cu.pcuMemRelease(mem); return err; } - if (auto err = reserveAdrressAndMapMemory(&ptr, size, alignment, mem); CUDA_SUCCESS != err) + if (auto err = reserveAdrressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { - CloseHandle(handle); + CloseHandle(params.osHandle); cu.pcuMemRelease(mem); return err; } - *outMem = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), size, ptr, mem, handle); - - return CUDA_SUCCESS; -} - -CCUDADevice::SSharedCUDAMemory::~SSharedCUDAMemory() -{ - device->releaseExportableMemory(this); -} - -CCUDADevice::SExternalCUDASemaphore::~SExternalCUDASemaphore() -{ - device->destroyExternalSemaphore(this); -} - -core::smart_refctd_ptr CCUDADevice::exportGPUBuffer(SSharedCUDAMemory* mem, ILogicalDevice* device) -{ - - if (!device || !mem || !mem->memory || !mem->osHandle || !mem->ptr || !mem->size) - return nullptr; - + if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - CUuuid id; - // TODO(Atil): Cache properties - if (CUDA_SUCCESS != m_handler->getCUDAFunctionTable().pcuDeviceGetUuid(&id, m_handle)) - return nullptr; - - if (memcmp(&id, device->getPhysicalDevice()->getProperties().deviceUUID, 16)) - return nullptr; - } - - auto buf = device->createBuffer(IGPUBuffer::SCreationParams { - asset::IBuffer::SCreationParams{ - .size = mem->size, - .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT - }, - IDeviceMemoryBacked::SCreationParams{ - 
IDeviceMemoryBacked::SCachedCreationParams{ - .externalHandleType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, - .externalHandle = mem->osHandle - } - }}); - - auto req = buf->getMemoryReqs(); - req.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto allocation = device->allocate(req, buf.get()); - - if (!(allocation.memory && allocation.offset != ILogicalDevice::InvalidMemoryOffset)) - return nullptr; - - buf->chainPreDestroyCleanup(std::make_unique(core::smart_refctd_ptr(mem))); - return buf; -} - -CUresult CCUDADevice::importGPUBuffer(core::smart_refctd_ptr* outPtr, IGPUBuffer* buf) -{ - if (!buf || !outPtr) - return CUDA_ERROR_INVALID_VALUE; - - auto& params = buf->getCachedCreationParams(); - - if (!params.externalHandleType.value) - return CUDA_ERROR_INVALID_VALUE; - - CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { - .type = static_cast(params.externalHandleType.value), - .handle = {.win32 = {.handle = buf->getExternalHandle()}}, - .size = buf->getMemoryReqs().size, - }; - - CUmemGenericAllocationHandle mem = 0; - CUdeviceptr ptr = 0; - void* handle = handleDesc.handle.win32.handle; - - auto& cu = m_handler->getCUDAFunctionTable(); - if (auto err = cu.pcuMemImportFromShareableHandle(&mem, buf->getExternalHandle(), - static_cast(params.externalHandleType.value)); - CUDA_SUCCESS != err) - return err; - - if(auto err = reserveAdrressAndMapMemory(&ptr, buf->getSize(), 1u << buf->getMemoryReqs().alignmentLog2, mem)) - { - cu.pcuMemRelease(mem); + CloseHandle(params.osHandle); return err; } - - *outPtr = core::make_smart_refctd_ptr( - core::smart_refctd_ptr(this), - buf->getSize(), ptr, mem, handle); - buf->chainPreDestroyCleanup(std::make_unique(*outPtr)); + *outMem = core::smart_refctd_ptr(new CCUDASharedMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); + return CUDA_SUCCESS; } -CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, IGPUSemaphore* sema) +CUresult 
CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) { if (!sema || !outPtr) return CUDA_ERROR_INVALID_VALUE; auto& cu = m_handler->getCUDAFunctionTable(); - auto handleType = sema->getCreationParams().externalHandleType.value; + auto handleType = sema->getCreationParams().externalHandleTypes; auto handle = sema->getCreationParams().externalHandle; - if (!handleType || !handle) + if (!handleType.hasFlags(ISemaphore::EHT_OPAQUE_WIN32) || !handle) return CUDA_ERROR_INVALID_VALUE; CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { - .type = static_cast(handleType), + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, .handle = {.win32 = {.handle = handle }}, }; @@ -224,120 +149,11 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(core::smart_refctd_ptr(this), cusema, handle); - sema->chainPreDestroyCleanup(std::make_unique(*outPtr)); - return CUDA_SUCCESS; -} - -#if 0 -CUresult CCUDAHandler::registerBuffer(GraphicsAPIObjLink* link, uint32_t flags) -{ - assert(link->obj); - auto glbuf = static_cast(link->obj.get()); - auto retval = cuda.pcuGraphicsGLRegisterBuffer(&link->cudaHandle,glbuf->getOpenGLName(),flags); - if (retval!=CUDA_SUCCESS) - link->obj = nullptr; - return retval; -} -CUresult CCUDAHandler::registerImage(GraphicsAPIObjLink* link, uint32_t flags) -{ - assert(link->obj); - - auto format = link->obj->getCreationParameters().format; - if (asset::isBlockCompressionFormat(format) || asset::isDepthOrStencilFormat(format) || asset::isScaledFormat(format) || asset::isPlanarFormat(format)) - return CUDA_ERROR_INVALID_IMAGE; - - auto glimg = static_cast(link->obj.get()); - GLenum target = glimg->getOpenGLTarget(); - switch (target) - { - case GL_TEXTURE_2D: - case GL_TEXTURE_2D_ARRAY: - case GL_TEXTURE_CUBE_MAP: - case GL_TEXTURE_3D: - break; - default: - return CUDA_ERROR_INVALID_IMAGE; - break; - } - auto retval = cuda.pcuGraphicsGLRegisterImage(&link->cudaHandle,glimg->getOpenGLName(),target,flags); - if 
(retval != CUDA_SUCCESS) - link->obj = nullptr; - return retval; -} - - -constexpr auto MaxAquireOps = 4096u; - -CUresult CCUDAHandler::acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedPointer_v2(&iit->asBuffer.pointer,outbufferSizes ? sit:&tmp,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -CUresult CCUDAHandler::acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedMipmappedArray(&iit->asImage.mipmappedArray,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; - } + *outPtr = core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, handle), core::dont_grab); return CUDA_SUCCESS; } -CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsSubResourceGetMappedArray(&iit->asImage.array,iit->cudaHandle,*ait,*mit); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -#endif } -#endif // _NBL_COMPILE_WITH_CUDA_ +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 183afe6b43..09c2fbe14e 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -3,6 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDAHandler.h" +#include 
"nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "jitify/jitify.hpp" @@ -410,7 +411,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste NVRTC nvrtc = {}; #if defined(_NBL_WINDOWS_API_) // Perpetual TODO: any new CUDA releases we need to account for? - const char* nvrtc64_versions[] = { "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; + const char* nvrtc64_versions[] = { "nvrtc64_120", "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; for (auto verpath=nvrtc64_versions; *verpath; verpath++) { @@ -468,7 +469,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste { const void* contents = it.second.data(); headers.push_back(core::make_smart_refctd_ptr>( - core::smart_refctd_ptr(system),it.first.c_str(), + it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, const_cast(contents),it.second.size()+1u )); @@ -514,7 +515,8 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; auto ptx = core::make_smart_refctd_ptr(_size); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,reinterpret_cast(ptx->getPointer()))}; + nvrtcResult result = m_nvrtc.pnvrtcGetPTX(prog, reinterpret_cast(ptx->getPointer())); + return {std::move(ptx),result}; } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) @@ -538,7 +540,8 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct CUuuid uuid = {}; if (m_cuda.pcuDeviceGetUuid(&uuid,handle)!=CUDA_SUCCESS) continue; - if (!memcmp(&uuid,&physicalDevice->getLimits().deviceUUID,VK_UUID_SIZE)) + + if 
(!memcmp(&uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; for (int i=0; i CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); + auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,handle,core::smart_refctd_ptr(this)); return core::smart_refctd_ptr(device,core::dont_grab); } } @@ -633,4 +636,4 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct } -#endif // _NBL_COMPILE_WITH_CUDA_ +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp new file mode 100644 index 0000000000..3ebb8e211d --- /dev/null +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ + +core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const +{ + IDeviceMemoryAllocator::SAllocateInfo info = { + .size = m_params.granularSize, + .externalHandleType = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + .externalHandle = m_params.osHandle, + }; + + auto pd = device->getPhysicalDevice(); + uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; + uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); + + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; + case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; + // TODO(Atil): Figure out how to handle these + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + default: break; + } + + 
IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; + req.size = m_params.granularSize; + req.memoryTypeBits = memoryTypeBits; + req.prefersDedicatedAllocation = nullptr != dedication; + req.requiresDedicatedAllocation = nullptr != dedication; + + return device->allocate(req, + dedication, + IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, m_params.osHandle, + std::make_unique(core::smart_refctd_ptr(this))).memory; +} + +#if 0 +core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDevice* device, core::bitflag usage) const +{ + if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) + return nullptr; + + auto buf = device->createBuffer({{ + .size = m_params.granularSize, + .usage = usage }, {{ + .postDestroyCleanup = std::make_unique(core::smart_refctd_ptr(this)), + .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + .externalHandle = m_params.osHandle + }}}); + + auto req = buf->getMemoryReqs(); + auto pd = device->getPhysicalDevice(); + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_DEVICE: req.memoryTypeBits &= pd->getDeviceLocalMemoryTypeBits(); break; + case CU_MEM_LOCATION_TYPE_HOST: req.memoryTypeBits &= pd->getHostVisibleMemoryTypeBits(); break; + // TODO(Atil): Figure out how to handle these + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + default: break; + } + + if (!device->allocate(req, buf.get()).isValid()) + return nullptr; + + return buf; +} + +#endif + +core::smart_refctd_ptr CCUDASharedMemory::exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const +{ + if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) + return nullptr; + + auto img = device->createImage({ + std::move(params), {{ .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE }}, + IGPUImage::TILING::LINEAR, + 1 /*preinitialized*/, + }); + + if (exportAsMemory(device, 
img.get())) + return img; + + return nullptr; +} + +CCUDASharedMemory::~CCUDASharedMemory() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + + CUresult re[] = { + cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), + }; + CloseHandle(m_params.osHandle); + +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDASharedSemaphore.cpp b/src/nbl/video/CCUDASharedSemaphore.cpp new file mode 100644 index 0000000000..4d6d3aacc9 --- /dev/null +++ b/src/nbl/video/CCUDASharedSemaphore.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ +CCUDASharedSemaphore::~CCUDASharedSemaphore() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + cu.pcuDestroyExternalSemaphore(m_handle); + CloseHandle(m_osHandle); +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index 55e4c90dab..ed2e3e0fab 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -210,7 +210,7 @@ bool CVulkanCommandBuffer::pipelineBarrier_impl(const core::bitflaggetQueueFamilyIndex()); info.dependencyFlags = static_cast(dependencyFlags.value); - getFunctionTable().vkCmdPipelineBarrier2KHR(m_cmdbuf,&info); + getFunctionTable().vkCmdPipelineBarrier2(m_cmdbuf,&info); return true; } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 7041e4bad7..752645f633 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -45,21 +45,39 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t 
initialValue) +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaphore::SCreationParams&& params) { + VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; + VkExportSemaphoreWin32HandleInfoKHR handleInfo = { .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, .dwAccess = GENERIC_ALL }; + VkExportSemaphoreCreateInfo exportInfo = { VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, &handleInfo, static_cast(params.externalHandleTypes.value) }; + VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; - type.pNext = nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR + type.pNext = params.externalHandleTypes.value ? &exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; - type.initialValue = initialValue; + type.initialValue = params.initialValue; - VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,&type }; + VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, &type }; createInfo.flags = static_cast(0); // flags must be 0 VkSemaphore semaphore; - if (m_devf.vk.vkCreateSemaphore(m_vkdev,&createInfo,nullptr,&semaphore)==VK_SUCCESS) - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this),semaphore); - else + if (VK_SUCCESS != m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore)) return nullptr; + + if (params.externalHandleTypes.value) + { + VkSemaphoreGetWin32HandleInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = semaphore, + .handleType = 
static_cast(params.externalHandleTypes.value), + }; + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, ¶ms.externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); + return nullptr; + } + } + + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), semaphore, std::move(params)); } auto CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) -> WAIT_RESULT { @@ -138,11 +156,32 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca vk_allocateFlagsInfo.deviceMask = 0u; // unused: for now } VkMemoryDedicatedAllocateInfo vk_dedicatedInfo = {VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, nullptr}; + VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo }; + vk_allocateInfo.allocationSize = info.size; + vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; + + VkImportMemoryWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .handle = info.externalHandle + }; + + const void** pNext = &vk_allocateFlagsInfo.pNext; + + if (info.externalHandleType) + { + // Importing + *pNext = &importInfo; + pNext = &importInfo.pNext; + } + if(info.dedication) { // VK_KHR_dedicated_allocation is in core 1.1, no querying for support needed static_assert(MinimumVulkanApiVersion >= VK_MAKE_API_VERSION(0,1,1,0)); - vk_allocateFlagsInfo.pNext = &vk_dedicatedInfo; + *pNext = &vk_dedicatedInfo; + pNext = &vk_dedicatedInfo.pNext; + switch (info.dedication->getObjectType()) { case IDeviceMemoryBacked::EOT_BUFFER: @@ -157,9 +196,6 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca break; } } - VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo}; - vk_allocateInfo.allocationSize = info.size; - vk_allocateInfo.memoryTypeIndex = 
info.memoryTypeIndex; VkDeviceMemory vk_deviceMemory; auto vk_res = m_devf.vk.vkAllocateMemory(m_vkdev, &vk_allocateInfo, nullptr, &vk_deviceMemory); @@ -168,7 +204,17 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; - ret.memory = core::make_smart_refctd_ptr(this,info.size,allocateFlags,memoryPropertyFlags,info.dedication,vk_deviceMemory); + + CVulkanMemoryAllocation::SCreationParams params = { + .allocateFlags = allocateFlags, + .memoryPropertyFlags = memoryPropertyFlags, + .externalHandleType = info.externalHandleType, + .externalHandle = info.externalHandle, + .dedicated = !!info.dedication, + .allocationSize = info.size, + }; + + ret.memory = core::make_smart_refctd_ptr(this,vk_deviceMemory, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) { diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index b83a8cc803..d8f934ceb9 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -52,7 +52,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice return CVulkanQueue::getResultFrom(m_devf.vk.vkDeviceWaitIdle(m_vkdev)); } - core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) override; + core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) override; WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 5a4dfd5ff5..fb214c897e 100644 --- 
a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -4,11 +4,15 @@ namespace nbl::video { CVulkanMemoryAllocation::CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle -) : IDeviceMemoryAllocation(dev,size,flags,memoryPropertyFlags,isDedicated), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + SCreationParams&& params +) + : IDeviceMemoryAllocation(dev,std::move(params)) + , m_vulkanDevice(dev) + , m_deviceMemoryHandle(deviceMemoryHandle) +{ +} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 470e914ae3..d9508411b0 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -15,10 +15,9 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation { public: CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 0f771a41ae..e457ae3a2b 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1201,6 +1201,9 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart if (isExtensionSupported(VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME)) properties.limits.cooperativeMatrixRobustness = cooperativeMatrixFeatures.robustness; #endif + 
properties.limits.externalFenceWin32 = isExtensionSupported(VK_KHR_EXTERNAL_FENCE_WIN32_EXTENSION_NAME); + properties.limits.externalMemoryWin32 = isExtensionSupported(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); + properties.limits.externalSemaphoreWin32 = isExtensionSupported(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); } // we compare all limits against the defaults easily! diff --git a/src/nbl/video/CVulkanQueue.cpp b/src/nbl/video/CVulkanQueue.cpp index 2dd76a47a5..f94a4a8d7d 100644 --- a/src/nbl/video/CVulkanQueue.cpp +++ b/src/nbl/video/CVulkanQueue.cpp @@ -84,7 +84,7 @@ auto CVulkanQueue::submit_impl(const std::span _submi outSubmitInfo->signalSemaphoreInfoCount = fillSemaphoreInfo(submit.signalSemaphores,outSignalSemaphoreInfo); outSubmitInfo++; } - const auto vk_result = static_cast(m_originDevice)->getFunctionTable()->vk.vkQueueSubmit2KHR(m_vkQueue,submits.size(),submits.data(),VK_NULL_HANDLE); + const auto vk_result = static_cast(m_originDevice)->getFunctionTable()->vk.vkQueueSubmit2(m_vkQueue,submits.size(),submits.data(),VK_NULL_HANDLE); return getResultFrom(vk_result); } diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 9290110d8d..2beb7cb21b 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,8 +15,11 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, const VkSemaphore semaphore) - : ISemaphore(std::move(_vkdev)), m_semaphore(semaphore) {} + inline CVulkanSemaphore(core::smart_refctd_ptr&& dev, const VkSemaphore semaphore, SCreationParams&& params = {}) + : ISemaphore(std::move(dev), std::move(params)) + , m_semaphore(semaphore) + {} + ~CVulkanSemaphore(); uint64_t getCounterValue() const override; diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 4ddb828f39..de6bc6b880 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ 
b/src/nbl/video/IGPUCommandBuffer.cpp @@ -164,7 +164,7 @@ bool IGPUCommandBuffer::reset(const core::bitflag flags) bool IGPUCommandBuffer::end() { const bool whollyInsideRenderpass = m_recordingFlags.hasFlags(USAGE::RENDER_PASS_CONTINUE_BIT); - if (!checkStateBeforeRecording(whollyInsideRenderpass ? queue_flags_t::GRAPHICS_BIT:queue_flags_t::NONE,whollyInsideRenderpass ? RENDERPASS_SCOPE::INSIDE:RENDERPASS_SCOPE::OUTSIDE)) + if (!checkStateBeforeRecording(whollyInsideRenderpass ? queue_flags_t::GRAPHICS_BIT:~queue_flags_t::NONE,whollyInsideRenderpass ? RENDERPASS_SCOPE::INSIDE:RENDERPASS_SCOPE::OUTSIDE)) return false; m_state = STATE::EXECUTABLE; @@ -302,7 +302,7 @@ bool IGPUCommandBuffer::waitEvents(const uint32_t eventCount, IEvent* const* con bool IGPUCommandBuffer::pipelineBarrier(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) { - if (!checkStateBeforeRecording(/*everything is allowed*/)) + if (!checkStateBeforeRecording(~queue_flags_t::NONE)) return false; if (depInfo.memBarrierCount==0u && depInfo.bufBarrierCount==0u && depInfo.imgBarrierCount==0u) From bd32f3617632811af2028abf273bf59bcdb6d060 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 4 Jan 2024 18:45:34 +0300 Subject: [PATCH 09/62] point jitify to the right hash --- 3rdparty/jitify | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/jitify b/3rdparty/jitify index 0d6dbd8ccd..1a0ca0e837 160000 --- a/3rdparty/jitify +++ b/3rdparty/jitify @@ -1 +1 @@ -Subproject commit 0d6dbd8ccd07e6bfc811d363a54912dfc6d4799a +Subproject commit 1a0ca0e837405506f3b8f7883bacb71c20d86d96 From b1c5a46ba6e340945cf8e52215aed54dd1418ae8 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Fri, 5 Jan 2024 01:02:16 +0300 Subject: [PATCH 10/62] update examples && use non KHR version of vk functions --- examples_tests | 2 +- src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples_tests b/examples_tests index 
6ce21d5c5c..4159025751 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 6ce21d5c5c8026b6772f3e60e21096ee54353a81 +Subproject commit 415902575143a28cba08d677c73f1e917f3367cc diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 752645f633..002dad3ae7 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -99,7 +99,7 @@ auto CVulkanLogicalDevice::waitForSemaphores(const std::span Date: Thu, 4 Jan 2024 12:03:52 +0100 Subject: [PATCH 11/62] correct bad validations, KHR instead of coe func usage etc. --- src/nbl/video/CVulkanCommandBuffer.cpp | 2 +- src/nbl/video/CVulkanQueue.cpp | 2 +- src/nbl/video/IGPUCommandBuffer.cpp | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index ed2e3e0fab..af090c92c3 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -431,7 +431,7 @@ bool CVulkanCommandBuffer::bindDescriptorSets_impl(const asset::E_PIPELINE_BIND_ } } // with K slots you need at most (K+1)/2 calls - assert(bindCallsCount < (IGPUPipelineLayout::DESCRIPTOR_SET_COUNT-1)/2); + assert(bindCallsCount <= (IGPUPipelineLayout::DESCRIPTOR_SET_COUNT+1)/2); return true; } diff --git a/src/nbl/video/CVulkanQueue.cpp b/src/nbl/video/CVulkanQueue.cpp index f94a4a8d7d..65c85239a5 100644 --- a/src/nbl/video/CVulkanQueue.cpp +++ b/src/nbl/video/CVulkanQueue.cpp @@ -60,7 +60,7 @@ auto CVulkanQueue::submit_impl(const std::span _submi core::vector submits(_submits.size(),{VK_STRUCTURE_TYPE_SUBMIT_INFO_2_KHR,/*No interesting extensions*/nullptr,/*No protected stuff yet*/0}); core::vector waitSemaphores(waitSemCnt); core::vector commandBuffers(cmdBufCnt,{VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO_KHR,nullptr}); - core::vector signalSemaphores(waitSemCnt); + core::vector signalSemaphores(signalSemCnt); auto outSubmitInfo = 
submits.data(); auto outWaitSemaphoreInfo = waitSemaphores.data(); diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index de6bc6b880..2ede1f2c0f 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -164,7 +164,10 @@ bool IGPUCommandBuffer::reset(const core::bitflag flags) bool IGPUCommandBuffer::end() { const bool whollyInsideRenderpass = m_recordingFlags.hasFlags(USAGE::RENDER_PASS_CONTINUE_BIT); - if (!checkStateBeforeRecording(whollyInsideRenderpass ? queue_flags_t::GRAPHICS_BIT:~queue_flags_t::NONE,whollyInsideRenderpass ? RENDERPASS_SCOPE::INSIDE:RENDERPASS_SCOPE::OUTSIDE)) + auto allowedQueueCaps = queue_flags_t::GRAPHICS_BIT; + if (!whollyInsideRenderpass) + allowedQueueCaps |= queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT; + if (!checkStateBeforeRecording(allowedQueueCaps,whollyInsideRenderpass ? RENDERPASS_SCOPE::INSIDE:RENDERPASS_SCOPE::OUTSIDE)) return false; m_state = STATE::EXECUTABLE; From 725a984ecb2ae675ac04ff9153fa68860e8938fb Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 4 Jan 2024 12:09:55 +0100 Subject: [PATCH 12/62] revert a dangerous api change --- include/nbl/video/IDeviceMemoryAllocation.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 7074f8861b..64529858ec 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -141,14 +141,14 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted size_t offset = 0ull; size_t length = 0ull; }; - inline bool map(const MemoryRange& range, const core::bitflag accessHint=IDeviceMemoryAllocation::EMCAF_READ_AND_WRITE) + inline void* map(const MemoryRange& range, const core::bitflag accessHint=IDeviceMemoryAllocation::EMCAF_READ_AND_WRITE) { if (isCurrentlyMapped()) - return false; + return nullptr; 
if(accessHint.hasFlags(EMCAF_READ) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) - return false; + return nullptr; if(accessHint.hasFlags(EMCAF_WRITE) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) - return false; + return nullptr; m_mappedPtr = reinterpret_cast(map_impl(range,accessHint)); if (m_mappedPtr) m_mappedPtr -= range.offset; From d2c9382f56af5f7b44541d5f2ab229fcdde0ad49 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 4 Jan 2024 21:21:42 +0100 Subject: [PATCH 13/62] update examples_tests --- 3rdparty/dxc/dxc | 2 +- examples_tests | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 024c8a9a34..79bf3aa07d 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 024c8a9a349dc45f5b4818c413502e0a45f5d542 +Subproject commit 79bf3aa07d0e603aded9d93b23bf5930d75dd539 diff --git a/examples_tests b/examples_tests index 4159025751..138356a4a5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 415902575143a28cba08d677c73f1e917f3367cc +Subproject commit 138356a4a5e277859c9d156967c0187e45ca8e49 From 2d24604ba86af24a38c65d69ed4d05fef12f2519 Mon Sep 17 00:00:00 2001 From: Przemek Date: Fri, 5 Jan 2024 17:08:16 +0100 Subject: [PATCH 14/62] Disabled CSPIRVIntrospector --- include/nbl/asset/utils/CSPIRVIntrospector.h | 75 +++++++++++--------- src/nbl/asset/utils/CSPIRVIntrospector.cpp | 3 + 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index 926343a1f1..77ad1a4799 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -3,7 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h #ifndef _NBL_ASSET_C_SPIRV_INTROSPECTOR_H_INCLUDED_ #define _NBL_ASSET_C_SPIRV_INTROSPECTOR_H_INCLUDED_ - +#if 0 #include "nbl/core/declarations.h" #include 
@@ -27,6 +27,20 @@ namespace spirv_cross struct SPIRType; } +// podzielic CIntrospectionData na dwie klasy +// jedna bez inputOutput i bez push constant blocka `CIntrospectionData` +// druga dziedziczy z pierwszej i dodaje te 2 rzeczy `CStageIntrospectionData` + +// wszystkie struktury w CIntrospecionData powininny u¿ywaæ bit flagi, ozaczaj¹cej shader stage (core::unordered_map) +// CStageIntrospecionData nie powinien u¿ywaæ bit flagi, ozaczaj¹cej shader stage (core::vector) + +// hashowane s¹ tylko set i binding +// dla spec constant tylko specConstantID +// validacja kolizji (dla SSpecConstants mo¿e siê jedynie ró¿niæ name) +// ogarn¹æ sytuacje gdy jeden descriptor binding ma wiêcej arrayElementCount ni¿ w SPIR-V +// w `CStageIntrospectionData` powinien byæ trzymana struktura `SIntrospectionParams` + +// namespace nbl::asset { class NBL_API2 CSPIRVIntrospector : public core::Uncopyable @@ -88,6 +102,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable //! Sorted by `location` core::vector inputOutput; + //! Push constants uniform block struct { bool present; core::string name; @@ -123,7 +138,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable return false; if (cpuShader->getContent()->getSize() != rhs.cpuShader->getContent()->getSize()) return false; - return memcmp(cpuShader->getContent()->getPointer(), rhs.cpuShader->getContent()->getPointer(), cpuShader->getContent()->getSize()) == 0;; + return memcmp(cpuShader->getContent()->getPointer(), rhs.cpuShader->getContent()->getPointer(), cpuShader->getContent()->getSize()) == 0; } }; @@ -132,43 +147,26 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable //! params.cpuShader.contentType should be ECT_SPIRV //! 
the compiled SPIRV must be compiled with IShaderCompiler::SCompilerOptions::debugInfoFlags enabling EDIF_SOURCE_BIT implicitly or explicitly, with no `spirvOptimizer` used in order to include names in introspection data + // powinna zwracac CStageIntrospectionData core::smart_refctd_ptr introspect(const SIntrospectionParams& params, bool insertToCache = true); + // + //core::smart_refctd_ptr merge(const std::span& asdf, const ICPUShader::SSPecInfo::spec_constant_map_t& = {}); + // When the methods take a span of shaders, they are computing things for an imaginary pipeline that includes **all** of them + // przeniesc do CIntrospectionData std::pair::E_TYPE> getImageInfoFromIntrospection(uint32_t set, uint32_t binding, const std::span _infos); - - inline core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection(const std::span _infos) - { - core::vector> introspections(_infos.size()); - if (!introspectAllShaders(introspections.data(),_infos)) - return nullptr; - - return createPushConstantRangesFromIntrospection_impl(introspections.data(),_infos); - } - inline core::smart_refctd_ptr createApproximateDescriptorSetLayoutFromIntrospection(uint32_t set, const std::span _infos) - { - core::vector> introspections(_infos.size()); - if (!introspectAllShaders(introspections.data(),_infos)) - return nullptr; - - return createApproximateDescriptorSetLayoutFromIntrospection_impl(set,introspections.data(), _infos); - } - inline core::smart_refctd_ptr createApproximatePipelineLayoutFromIntrospection(const std::span _infos) - { - core::vector> introspections(_infos.size()); - if (!introspectAllShaders(introspections.data(),_infos)) - return nullptr; - - return createApproximatePipelineLayoutFromIntrospection_impl(introspections.data(),_infos); - } // inline core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const ICPUShader::SSpecInfo& info) + //TODO: inline core::smart_refctd_ptr 
createApproximateComputePipelineFromIntrospection(CStageIntrospectionData* asdf) { if (info.shader->getStage()!=IShader::ESS_COMPUTE) return nullptr; core::smart_refctd_ptr introspection = nullptr; + + //TODO: zamiast tego mergujemy `CStageIntrospectionData` w `CIntrospectionData` u¿ywaj¹c `merge` if (!introspectAllShaders(&introspection,{&info,1})) return nullptr; @@ -180,15 +178,24 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable // core::smart_refctd_ptr createApproximateRenderpassIndependentPipelineFromIntrospection(const std::span _infos); + + struct CShaderStages + { + const CStageIntrospectionData* vertex = nullptr; + const CStageIntrospectionData* fragment = nullptr; + const CStageIntrospectionData* control = nullptr; + const CStageIntrospectionData* evaluation = nullptr; + const CStageIntrospectionData* geometry = nullptr; + } + core::smart_refctd_ptr createApproximateGraphicsPipeline(const CShaderStages& shaderStages); private: - using mapId2SpecConst_t = core::unordered_map; + //TODO: przenieœæ jako members do CIntrospectionData + core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection_impl(); + core::smart_refctd_ptr createApproximateDescriptorSetLayoutFromIntrospection_impl(const uint32_t setID); + core::smart_refctd_ptr createApproximatePipelineLayoutFromIntrospection_impl(); - core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection_impl(core::smart_refctd_ptr* const introspections, const std::span shaders); - core::smart_refctd_ptr createApproximateDescriptorSetLayoutFromIntrospection_impl(uint32_t _set, core::smart_refctd_ptr* const introspections, const std::span shaders); - core::smart_refctd_ptr createApproximatePipelineLayoutFromIntrospection_impl(core::smart_refctd_ptr* const introspections, const std::span shaders); - - bool introspectAllShaders(core::smart_refctd_ptr* introspection, const std::span _infos); + core::smart_refctd_ptr introspectShader(const ICPUShader::SSpecInfo _infos); 
core::smart_refctd_ptr doIntrospection(spirv_cross::Compiler& _comp, const std::string& entryPoint, const IShader::E_SHADER_STAGE stage) const; void shaderMemBlockIntrospection(spirv_cross::Compiler& _comp, impl::SShaderMemoryBlock& _res, uint32_t _blockBaseTypeID, uint32_t _varID, const mapId2SpecConst_t& _sortedId2sconst) const; @@ -213,9 +220,11 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable }; using ParamsToDataMap = core::unordered_map, KeyHasher>; + // using ParamsToDataMap = core::unordered_set, KeyHasher, KeyEquals>; ParamsToDataMap m_introspectionCache; }; } // nbl::asset #endif +#endif diff --git a/src/nbl/asset/utils/CSPIRVIntrospector.cpp b/src/nbl/asset/utils/CSPIRVIntrospector.cpp index 4378661ba0..ed4b9e3634 100644 --- a/src/nbl/asset/utils/CSPIRVIntrospector.cpp +++ b/src/nbl/asset/utils/CSPIRVIntrospector.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#if 0 #include "nbl/asset/utils/CSPIRVIntrospector.h" #include "nbl/asset/utils/spvUtils.h" @@ -798,3 +799,5 @@ CSPIRVIntrospector::CIntrospectionData::~CIntrospectionData() } // nbl:asset + +#endif From 2114e50dabf7b89410c2a753df806070a9a31b1e Mon Sep 17 00:00:00 2001 From: Przemek Date: Fri, 5 Jan 2024 17:59:20 +0100 Subject: [PATCH 15/62] small fixes --- include/nbl/video/IPhysicalDevice.h | 10 +++++----- src/nbl/CMakeLists.txt | 10 ++++------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index f29bf4938d..583c8ac9d0 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -559,11 +559,11 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable if (videoDecodeDPB && !other.videoDecodeDPB) return false; if (videoEncodeInput && !other.videoEncodeInput) return false; if 
(videoEncodeDPB && !other.videoEncodeDPB) return false; - if (other.storageImageLoadWithoutFormat && !storageImageLoadWithoutFormat) return false; - if (other.storageImageStoreWithoutFormat && !storageImageStoreWithoutFormat) return false; - if (other.depthCompareSampledImage && !depthCompareSampledImage) return false; - if (other.hostImageTransfer && !hostImageTransfer) return false; - if (other.log2MaxSamples < log2MaxSamples) return false; + if (storageImageLoadWithoutFormat && !other.storageImageLoadWithoutFormat) return false; + if (storageImageStoreWithoutFormat && !other.storageImageStoreWithoutFormat) return false; + if (depthCompareSampledImage && !other.depthCompareSampledImage) return false; + if (hostImageTransfer && !other.hostImageTransfer) return false; + if (log2MaxSamples > other.log2MaxSamples) return false; return true; } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index dc9f4e7bef..8297a09692 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -83,15 +83,13 @@ include(common) #[[ Loaders and writers compile options available to edit by user All revelant _NBL_COMPILE_WITH will be there]] -option(_NBL_COMPILE_WITH_MTL_LOADER_ "Compile with MTL Loader" ON) -option(_NBL_COMPILE_WITH_OBJ_LOADER_ "Compile with OBJ Loader" ON) +option(_NBL_COMPILE_WITH_MTL_LOADER_ "Compile with MTL Loader" OFF) #default off until Material Compiler 2 +option(_NBL_COMPILE_WITH_OBJ_LOADER_ "Compile with OBJ Loader" OFF) #default off until Material Compiler 2 #option(_NBL_COMPILE_WITH_OBJ_WRITER_ "Compile with OBJ Writer" ON) uncomment when writer exists -option(_NBL_COMPILE_WITH_STL_LOADER_ "Compile with STL Loader" ON) +option(_NBL_COMPILE_WITH_STL_LOADER_ "Compile with STL Loader" OFF) #default off until Material Compiler 2 option(_NBL_COMPILE_WITH_STL_WRITER_ "Compile with STL Writer" ON) -option(_NBL_COMPILE_WITH_PLY_LOADER_ "Compile with PLY Loader" ON) +option(_NBL_COMPILE_WITH_PLY_LOADER_ "Compile with PLY Loader" OFF) #default off 
until Material Compiler 2 option(_NBL_COMPILE_WITH_PLY_WRITER_ "Compile with PLY Writer" ON) -option(_NBL_COMPILE_WITH_BAW_LOADER_ "Compile with BAW Loader" OFF) -option(_NBL_COMPILE_WITH_BAW_WRITER_ "Compile with BAW Writer" OFF) option(_NBL_COMPILE_WITH_JPG_LOADER_ "Compile with JPG Loader" ON) option(_NBL_COMPILE_WITH_JPG_WRITER_ "Compile with JPG Writer" ON) option(_NBL_COMPILE_WITH_PNG_LOADER_ "Compile with PNG Loader" ON) From f6320ce3eeebd1684e3212b6f7f3997063968cde Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 6 Jan 2024 21:44:01 +0100 Subject: [PATCH 16/62] remove unused cruft --- include/nbl/asset/IAssetManager.h | 4 +- include/nbl/core/SingleEventHandler.h | 189 ------------------ include/nbl/core/declarations.h | 1 - .../nbl/video/utilities/ICommandPoolCache.h | 10 +- src/nbl/video/utilities/ICommandPoolCache.cpp | 6 +- 5 files changed, 9 insertions(+), 201 deletions(-) delete mode 100644 include/nbl/core/SingleEventHandler.h diff --git a/include/nbl/asset/IAssetManager.h b/include/nbl/asset/IAssetManager.h index f5c49e264b..572acfa2b3 100644 --- a/include/nbl/asset/IAssetManager.h +++ b/include/nbl/asset/IAssetManager.h @@ -45,7 +45,7 @@ std::function makeAssetDisposeFunc(const IAssetManager* con @see IAsset */ -class NBL_API2 IAssetManager : public core::IReferenceCounted, public core::QuitSignalling +class NBL_API2 IAssetManager : public core::IReferenceCounted { // the point of those functions is that lambdas returned by them "inherits" friendship friend std::function makeAssetGreetFunc(const IAssetManager* const _mgr); @@ -150,8 +150,6 @@ class NBL_API2 IAssetManager : public core::IReferenceCounted, public core::Quit protected: virtual ~IAssetManager() { - quitEventHandler.execute(); - for (size_t i = 0u; i < m_assetCache.size(); ++i) if (m_assetCache[i]) delete m_assetCache[i]; diff --git a/include/nbl/core/SingleEventHandler.h b/include/nbl/core/SingleEventHandler.h deleted file mode 100644 index 2bd7440c31..0000000000 --- 
a/include/nbl/core/SingleEventHandler.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_CORE_CORE_SINGLE_EVENT_HANDLER_H__ -#define __NBL_CORE_CORE_SINGLE_EVENT_HANDLER_H__ - - -#include "nbl/core/decl/Types.h" - -namespace nbl::core -{ - -// TODO: actually implement and test -//#define NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY -class SingleEventHandler -{ - public: - using Function = std::function; - - protected: - using FunctionContainerType = core::forward_list; - using FunctionContainerIt = typename FunctionContainerType::iterator; - - bool mExecuteOnDestroy; - uint32_t mFunctionsCount; - FunctionContainerType mFunctions; - FunctionContainerIt mLastFunction; -#ifdef NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY - // returns prev and - inline std::pair findFunction(const Function& function) - { - auto prev = mFunctions.before_begin(); - auto curr = mFunctions.begin(); - while (prev!=mLastFunction) - { - if (*curr==function) - break; - prev = curr++; - } - return {prev,curr}; - } -#endif - public: - SingleEventHandler(bool executeEventsOnDestroy) : mExecuteOnDestroy(executeEventsOnDestroy), mFunctionsCount(0u) - { - mLastFunction = mFunctions.before_begin(); - } - - virtual ~SingleEventHandler() - { - if (mExecuteOnDestroy) - for (auto& func : mFunctions) - func(); - } - - // - inline auto getFunctionCount() const { return mFunctionsCount; } - - // - inline void registerFunction(Function&& function) - { - mLastFunction = mFunctions.emplace_after(mLastFunction,std::forward(function)); - mFunctionsCount++; - } -#ifdef NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY - // no comparison operator for std::function<> so no find - //! 
does not call the operator() - inline void deregisterFunction(const Function& function) - { - auto found = findFunction(function); - if (found.first!=mLastFunction) - { - if (found.second==mLastFunction) - mLastFunction = found.first; - mFunctions.erase_after(found.first); - } - } - - inline void swapFunctions(const Function& oldFunction, Function&& newFunction) - { - auto found = findFunction(oldFunction); - if (found.second!=mFunctions.end()) - found.second->swap(newFunction); - } -#endif - // - inline void execute() - { - for (auto& func : mFunctions) - func(); - mFunctionsCount = 0u; - mFunctions.clear(); - mLastFunction = mFunctions.before_begin(); - } -}; - -// -class QuitSignalling -{ - public: - inline void registerOnQuit(SingleEventHandler::Function&& function) - { - quitEventHandler.registerFunction(std::move(function)); - } -#ifdef NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY - //! does not call the operator() - inline void deregisterOnQuit(const SingleEventHandler::Function& function) - { - quitEventHandler.deregisterFunction(function); - } -#endif - protected: - QuitSignalling() : quitEventHandler(false) {} - virtual ~QuitSignalling() {assert(!quitEventHandler.getFunctionCount());} - - SingleEventHandler quitEventHandler; -}; - -#ifdef NBL_EVENT_DEREGISTER_IMPLEMENTATION_READY -// -template -class FactoryAndStaticSafeST -{ - T data = {}; - QuitSignalling* factory = nullptr; - - protected: - virtual void preemptiveDestruction() - { - data = T(); - factory = nullptr; - } - - public: - FactoryAndStaticSafeST() = default; - ~FactoryAndStaticSafeST() - { - assert(!factory); - } - - T& getData(QuitSignalling* _factory) - { - if (_factory!=factory) - { - std::function func(preemptiveDestruction); - if (factory) - factory->deregisterOnQuit(func); - _factory->registerOnQuit(std::move(func)); - factory = _factory; - } - return data; - } -}; - -// -template -class FactoryAndStaticSafeMT : protected FactoryAndStaticSafeST -{ - 
static_assert(std::is_standard_layout::value, "Lock class is not standard layout"); - Lockable lock; - - protected: - inline void preemptiveDestruction() override - { - lock.lock(); - FactoryAndStaticSafeST::preemptiveDestruction(); - lock.unlock(); - } - - public: - FactoryAndStaticSafeMT() = default; - ~FactoryAndStaticSafeMT() {} - - std::pair> getData(QuitSignalling* _factory) - { - std::unique_lock lockFirst(lock); - return {FactoryAndStaticSafeST::getData(),std::move(lockFirst)}; - } -}; -#endif - -} - -#endif - - - - diff --git a/include/nbl/core/declarations.h b/include/nbl/core/declarations.h index ea5f4167c2..fa9ebe2b18 100644 --- a/include/nbl/core/declarations.h +++ b/include/nbl/core/declarations.h @@ -66,7 +66,6 @@ #include "nbl/core/util/to_underlying.h" // other useful things -#include "nbl/core/SingleEventHandler.h" #include "nbl/core/EventDeferredHandler.h" #include "nbl/core/IBuffer.h" #include "nbl/core/IReferenceCounted.h" diff --git a/include/nbl/video/utilities/ICommandPoolCache.h b/include/nbl/video/utilities/ICommandPoolCache.h index 2d2e8b8df2..f86ebde930 100644 --- a/include/nbl/video/utilities/ICommandPoolCache.h +++ b/include/nbl/video/utilities/ICommandPoolCache.h @@ -13,13 +13,12 @@ namespace nbl::video { -#if 0 // TODO: port class ICommandPoolCache : public core::IReferenceCounted { public: using CommandPoolAllocator = core::PoolAddressAllocatorST; - NBL_API2 ICommandPoolCache(ILogicalDevice* device, const uint32_t queueFamilyIx, const IGPUCommandPool::CREATE_FLAGS _flags, const uint32_t capacity); + NBL_API2 ICommandPoolCache(ILogicalDevice* const device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity); // inline uint32_t getCapacity() const {return m_cmdPoolAllocator.get_total_size();} @@ -33,6 +32,7 @@ class ICommandPoolCache : public core::IReferenceCounted return nullptr; } +#if 0 // TODO: port // inline uint32_t acquirePool() { @@ -106,12 +106,13 @@ class ICommandPoolCache : public 
core::IReferenceCounted NBL_API2 void operator()(); }; +#endif protected: friend class DeferredCommandPoolResetter; inline virtual ~ICommandPoolCache() { - m_deferredResets.cullEvents(0u); +// m_deferredResets.cullEvents(0u); free(m_reserved); delete[] m_cache; } @@ -121,9 +122,8 @@ class ICommandPoolCache : public core::IReferenceCounted core::smart_refctd_ptr* m_cache; void* m_reserved; CommandPoolAllocator m_cmdPoolAllocator; - GPUDeferredEventHandlerST m_deferredResets; +// GPUDeferredEventHandlerST m_deferredResets; }; -#endif } diff --git a/src/nbl/video/utilities/ICommandPoolCache.cpp b/src/nbl/video/utilities/ICommandPoolCache.cpp index e635911fdb..4c38fb5dec 100644 --- a/src/nbl/video/utilities/ICommandPoolCache.cpp +++ b/src/nbl/video/utilities/ICommandPoolCache.cpp @@ -6,9 +6,8 @@ using namespace nbl; using namespace video; -#if 0 // TODO: port -ICommandPoolCache::ICommandPoolCache(ILogicalDevice* device, const uint32_t queueFamilyIx, const ICommandPool::CREATE_FLAGS _flags, const uint32_t capacity) - : m_reserved(malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u))), m_cmdPoolAllocator(m_reserved,0u,0u,1u,capacity,1u), m_deferredResets() +ICommandPoolCache::ICommandPoolCache(ILogicalDevice* const device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity) + : m_reserved(malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u))), m_cmdPoolAllocator(m_reserved,0u,0u,1u,capacity,1u)//, m_deferredResets() { m_cache = new core::smart_refctd_ptr[capacity]; for (auto i=0u; i Date: Sun, 7 Jan 2024 01:08:41 +0100 Subject: [PATCH 17/62] draft --- include/nbl/video/ISemaphore.h | 229 ++++++++++++++++++++++++++++++++- src/nbl/CMakeLists.txt | 1 + src/nbl/video/IGPUFence.cpp | 18 --- src/nbl/video/ISemaphore.cpp | 21 +++ 4 files changed, 250 insertions(+), 19 deletions(-) delete mode 100644 src/nbl/video/IGPUFence.cpp create mode 100644 src/nbl/video/ISemaphore.cpp diff --git a/include/nbl/video/ISemaphore.h 
b/include/nbl/video/ISemaphore.h index 768fe1a66d..86c112e555 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -70,6 +70,233 @@ class ISemaphore : public IBackendObject SCreationParams m_creationParams; }; -} +class NBL_API2 TimelineEventHandlerBase : core::Unmovable, core::Uncopyable +{ + public: + // little utility + inline ISemaphore* getSemaphore() const {return m_sema.get();} + + protected: + inline TimelineEventHandlerBase(core::smart_refctd_ptr&& sema) : m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()) {} + + template + bool singleSemaphoreWait(const uint64_t value, const std::chrono::time_point& timeout_time) + { + const auto current_time = Clock::now(); + if (timeout_time>current_time && notTimedOut(value,std::chrono::duration_cast(timeout_time-current_time).count()); + return value; // we return it even on device loss or error, as to not hang up blocks for completion + return m_sema->getCounterValue(); + } + + bool notTimedOut(const uint64_t value, const uint64_t nanoseconds); + + core::smart_refctd_ptr m_sema; + uint64_t m_greatestSignal; + uint64_t m_greatestLatch; +}; + +#if 0 +// Could be quite easily made MT and relatively lockless, if only had a good lock-poor circular buffer impl +template +class TimelineEventHandlerST final : public TimelineEventHandlerBase +{ + constexpr static inline bool ReturnsBool = std::is_same_v()()),bool>; + struct FunctorValuePair + { + Functor func; + uint64_t geSemaValue; + }; + // could be a circular buffer but whatever + core::deque m_cb; + + inline uint32_t resetLatch() + { + m_greatestLatch = m_cb.empty() ? 
0:m_cb.back().geSemaValue; + return m_cb.size(); + } + + public: + inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : + TimelineEventHandlerBase(std::move(sema)), m_cb(initialCapacity) + { + resetLatch(); + } + ~TimelineEventHandlerST() + { + while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} + } + + inline uint32_t count() const {return m_cb.size();} + + // You can latch arbitrary functors upon the semaphore reaching some value + inline void latch(const uint64_t geSemaValue, Functor&& function) + { + //const auto oldValue = core::atomic_fetch_max(&m_greatestLatch,geSemaValue); + assert(geSemaValue>=m_greatestLatch); // you cannot latch out of order + m_greatestLatch = geSemaValue; + m_cb.emplace_back(std::move(function),geSemaValue); + } + + // Returns number of events still outstanding + inline uint32_t poll(bool& bailed) + { + m_greatestSignal = m_sema->getCounterValue(); + // in a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal + while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) + { + bailed = false; + if constexpr (ReturnsBool) + bailed = m_cb.front().func(); + m_cb.pop_front(); + if (bailed) + break; + } + return resetLatch(); + } + inline uint32_t poll() + { + bool dummy; + return poll(dummy); + } + + template + inline uint32_t wait(const std::chrono::time_point& timeout_time) + { + if constexpr (ReturnsBool) + { + // Perf-assumption: there are no latched events with wait values less or equal to m_greatestSignal + // So we have a bunch of events with semaphore values between m_greatestSignal and m_greatestLatch +#if 0 + for (std::chrono::time_point currentClockTime; (currentClockTime = Clock::now()) < timeout_time; ) + while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) + { + const bool bail = m_cb.front().func(); + m_cb.pop_front(); + if (bail) + return resetLatch(); + } +#endif + } + 
else + { + m_greatestSignal = singleSemaphoreWait(m_greatestLatch,timeout_time); + while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) + { + m_cb.front().func(); + m_cb.pop_front(); + } + } + return resetLatch(); + } + // The default behaviour of the underlying event handler is to wait for all events in its destructor. + // This will naturally cause you problems if you add functions latched on values you never signal, + // such as when you change your mind whether to submit. This method is then helpful to avoid a deadlock. + inline uint32_t abortOldest(const uint64_t upTo=~0ull) + { + m_greatestSignal = m_sema->getCounterValue(); + while (!m_cb.empty() && m_cb.front().geSemaValue<=upTo) + { + // don't want non-determinitistic behaviour, so execute everything that would have been executed anyway with a while(pollForReady()) + if (m_cb.front().geSemaValue<= m_greatestSignal) + m_cb.front().func(); + m_cb.pop_front(); + } + return resetLatch(); + } + inline uint32_t abortLatest(const uint64_t from=0ull) + { + m_greatestSignal = m_sema->getCounterValue(); + while (!m_cb.empty() && m_cb.back().geSemaValue>=from) + { + // don't want non-determinitistic behaviour, so execute everything that would have been executed anyway with a while(pollForReady()) + if (m_cb.back().geSemaValue<= m_greatestSignal) + m_cb.back().func(); + m_cb.pop_back(); + } + return resetLatch(); + } +}; + +template +class MultiTimelineEventHandlerST final +{ + public: + inline ~MultiTimelineEventHandlerST() + { + for (auto p : m_timelines) + delete p; + } + + inline const auto& getTimelines() const {return m_timelines;} + + // all the members are counteparts of the single timeline version + inline uint32_t count() const + { + uint32_t sum = 0; + for (auto p : m_timelines) + sum += p->count(); + return sum; + } + + inline void latch(ISemaphore* sema, const uint64_t geValue, Functor&& function) + { + auto found = m_timelines.find(sema); + if (found==m_timelines.end()) + found 
m_timelines.insert(found,new TimelineEventHandlerST(core::smart_refctd_ptr(sema))); + assert((*found)->getSemaphore()==sema); + found->latch(sema,geValue,std::move(function)); + } + + inline uint32_t poll() + { + uint32_t sum = 0; + for (auto p : m_timelines) + { + bool bailed; + p->poll(bailed); + if (bailed) + break; + } + return sum; + } + template + inline uint32_t wait(const std::chrono::time_point& timeout_time) + { + // want to give each event equal wait time, so interpolate (albeit weirdly) + return 455; + } + + inline uint32_t abortOldest(const uint64_t upTo=~0ull) + { + uint32_t sum = 0; + for (auto p : m_timelines) + sum += p->abortOldest(upTo); + return sum; + } + inline uint32_t abortLatest(const uint64_t from=0ull) + { + uint32_t sum = 0; + for (auto p : m_timelines) + sum += p->abortLatest(from); + return sum; + } + + private: + struct Compare + { + inline bool operator()(const TimelineEventHandlerST* lhs, const TimelineEventHandlerST* rhs) const + { + return lhs->getSemaphore()getSemaphore(); + } + inline bool operator()(const TimelineEventHandlerST* lhs, const ISemaphore* rhs) const + { + return lhs->getSemaphore() m_timelines; +}; +#endif + +} #endif \ No newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 8297a09692..7c7719052e 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -275,6 +275,7 @@ set(NBL_VIDEO_SOURCES ${NBL_ROOT_PATH}/src/nbl/video/ILogicalDevice.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUAccelerationStructure.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUCommandBuffer.cpp + ${NBL_ROOT_PATH}/src/nbl/video/ISemaphore.cpp ${NBL_ROOT_PATH}/src/nbl/video/IQueue.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUDescriptorSet.cpp ${NBL_ROOT_PATH}/src/nbl/video/IDeviceMemoryAllocation.cpp diff --git a/src/nbl/video/IGPUFence.cpp b/src/nbl/video/IGPUFence.cpp deleted file mode 100644 index 8104ed3313..0000000000 --- a/src/nbl/video/IGPUFence.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include 
"nbl/video/IGPUFence.h" -#include "nbl/video/ILogicalDevice.h" -#include "nbl/video/IPhysicalDevice.h" - -namespace nbl::video -{ - -IGPUFence::E_STATUS GPUEventWrapper::waitFenceWrapper(IGPUFence* fence, uint64_t timeout) -{ - return mDevice->waitForFences(1u,&fence,true,timeout); -} - -IGPUFence::E_STATUS GPUEventWrapper::getFenceStatusWrapper(IGPUFence* fence) -{ - return mDevice->getFenceStatus(fence); -} - -} \ No newline at end of file diff --git a/src/nbl/video/ISemaphore.cpp b/src/nbl/video/ISemaphore.cpp new file mode 100644 index 0000000000..18eca04e5a --- /dev/null +++ b/src/nbl/video/ISemaphore.cpp @@ -0,0 +1,21 @@ +#include "nbl/video/ISemaphore.h" +#include "nbl/video/ILogicalDevice.h" + +namespace nbl::video +{ + +bool TimelineEventHandlerBase::notTimedOut(const uint64_t value, const uint64_t nanoseconds) +{ + const ILogicalDevice::SSemaphoreWaitInfo info = {.semaphore=m_sema.get(),.value=value}; + switch (const_cast(m_sema->getOriginDevice())->waitForSemaphores({&info,1},true,nanoseconds)) + { + case ILogicalDevice::WAIT_RESULT::TIMEOUT: + return false; + break; + default: + break; + } + return true; +} + +} \ No newline at end of file From ad1e6ffdde69a2149836a9f40cbb0417890f6d31 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 16:45:10 +0100 Subject: [PATCH 18/62] move the TimelineEventHandlers to their own header, simplifying everything, also move the Semaphore wait structures and codes to ISemaphore --- include/nbl/video/ILogicalDevice.h | 26 +- include/nbl/video/ISemaphore.h | 246 ++-------------- include/nbl/video/TimelineEventHandlers.h | 325 ++++++++++++++++++++++ src/nbl/CMakeLists.txt | 1 - src/nbl/video/CVulkanLogicalDevice.cpp | 14 +- src/nbl/video/CVulkanLogicalDevice.h | 2 +- src/nbl/video/ISemaphore.cpp | 21 -- 7 files changed, 359 insertions(+), 276 deletions(-) create mode 100644 include/nbl/video/TimelineEventHandlers.h delete mode 100644 src/nbl/video/ISemaphore.cpp diff --git a/include/nbl/video/ILogicalDevice.h 
b/include/nbl/video/ILogicalDevice.h index d4cdc6fd99..a56c311f4e 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -147,32 +147,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual IQueue::RESULT waitIdle() const = 0; //! Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&& ) = 0; - // - struct SSemaphoreWaitInfo - { - const ISemaphore* semaphore; - uint64_t value; - }; - enum class WAIT_RESULT : uint8_t - { - TIMEOUT, - SUCCESS, - DEVICE_LOST, - _ERROR - }; - virtual WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) = 0; + virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) = 0; + virtual ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) = 0; // Forever waiting variant if you're confident that the fence will eventually be signalled - inline WAIT_RESULT blockForSemaphores(const std::span infos, const bool waitAll=true) + inline ISemaphore::WAIT_RESULT blockForSemaphores(const std::span infos, const bool waitAll=true) { + using retval_t = ISemaphore::WAIT_RESULT; if (!infos.empty()) { - auto waitStatus = WAIT_RESULT::TIMEOUT; - while (waitStatus==WAIT_RESULT::TIMEOUT) + auto waitStatus = retval_t::TIMEOUT; + while (waitStatus== retval_t::TIMEOUT) waitStatus = waitForSemaphores(infos,waitAll,999999999ull); return waitStatus; } - return WAIT_RESULT::SUCCESS; + return retval_t::SUCCESS; } //! 
Event Stuff diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 86c112e555..f16fa86baf 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -4,6 +4,8 @@ #include "nbl/core/IReferenceCounted.h" +#include + #include "nbl/video/decl/IBackendObject.h" @@ -13,6 +15,7 @@ namespace nbl::video class ISemaphore : public IBackendObject { public: + // basically a pool function virtual uint64_t getCounterValue() const = 0; //! Basically the counter can only monotonically increase with time (ergo the "timeline"): @@ -23,6 +26,21 @@ class ISemaphore : public IBackendObject // without any execution dependencies, you can only signal a value higher than 2 but less than 3 which is impossible. virtual void signal(const uint64_t value) = 0; + // We don't provide waits as part of the semaphore (cause you can await multiple at once with ILogicalDevice), + // but don't want to pollute ILogicalDevice with lots of enums and structs + struct SWaitInfo + { + const ISemaphore* semaphore; + uint64_t value; + }; + enum class WAIT_RESULT : uint8_t + { + TIMEOUT, + SUCCESS, + DEVICE_LOST, + _ERROR + }; + // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; @@ -70,233 +88,5 @@ class ISemaphore : public IBackendObject SCreationParams m_creationParams; }; -class NBL_API2 TimelineEventHandlerBase : core::Unmovable, core::Uncopyable -{ - public: - // little utility - inline ISemaphore* getSemaphore() const {return m_sema.get();} - - protected: - inline TimelineEventHandlerBase(core::smart_refctd_ptr&& sema) : m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()) {} - - template - bool singleSemaphoreWait(const uint64_t value, const std::chrono::time_point& timeout_time) - { - const auto current_time = Clock::now(); - if (timeout_time>current_time && notTimedOut(value,std::chrono::duration_cast(timeout_time-current_time).count()); - return value; // we return it even on device loss or error, as to not 
hang up blocks for completion - return m_sema->getCounterValue(); - } - - bool notTimedOut(const uint64_t value, const uint64_t nanoseconds); - - core::smart_refctd_ptr m_sema; - uint64_t m_greatestSignal; - uint64_t m_greatestLatch; -}; - -#if 0 -// Could be quite easily made MT and relatively lockless, if only had a good lock-poor circular buffer impl -template -class TimelineEventHandlerST final : public TimelineEventHandlerBase -{ - constexpr static inline bool ReturnsBool = std::is_same_v()()),bool>; - struct FunctorValuePair - { - Functor func; - uint64_t geSemaValue; - }; - // could be a circular buffer but whatever - core::deque m_cb; - - inline uint32_t resetLatch() - { - m_greatestLatch = m_cb.empty() ? 0:m_cb.back().geSemaValue; - return m_cb.size(); - } - - public: - inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : - TimelineEventHandlerBase(std::move(sema)), m_cb(initialCapacity) - { - resetLatch(); - } - ~TimelineEventHandlerST() - { - while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} - } - - inline uint32_t count() const {return m_cb.size();} - - // You can latch arbitrary functors upon the semaphore reaching some value - inline void latch(const uint64_t geSemaValue, Functor&& function) - { - //const auto oldValue = core::atomic_fetch_max(&m_greatestLatch,geSemaValue); - assert(geSemaValue>=m_greatestLatch); // you cannot latch out of order - m_greatestLatch = geSemaValue; - m_cb.emplace_back(std::move(function),geSemaValue); - } - - // Returns number of events still outstanding - inline uint32_t poll(bool& bailed) - { - m_greatestSignal = m_sema->getCounterValue(); - // in a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal - while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) - { - bailed = false; - if constexpr (ReturnsBool) - bailed = m_cb.front().func(); - m_cb.pop_front(); - if (bailed) - 
break; - } - return resetLatch(); - } - inline uint32_t poll() - { - bool dummy; - return poll(dummy); - } - - template - inline uint32_t wait(const std::chrono::time_point& timeout_time) - { - if constexpr (ReturnsBool) - { - // Perf-assumption: there are no latched events with wait values less or equal to m_greatestSignal - // So we have a bunch of events with semaphore values between m_greatestSignal and m_greatestLatch -#if 0 - for (std::chrono::time_point currentClockTime; (currentClockTime = Clock::now()) < timeout_time; ) - while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) - { - const bool bail = m_cb.front().func(); - m_cb.pop_front(); - if (bail) - return resetLatch(); - } -#endif - } - else - { - m_greatestSignal = singleSemaphoreWait(m_greatestLatch,timeout_time); - while (!m_cb.empty() && m_cb.front().geSemaValue<=m_greatestSignal) - { - m_cb.front().func(); - m_cb.pop_front(); - } - } - return resetLatch(); - } - - // The default behaviour of the underlying event handler is to wait for all events in its destructor. - // This will naturally cause you problems if you add functions latched on values you never signal, - // such as when you change your mind whether to submit. This method is then helpful to avoid a deadlock. 
- inline uint32_t abortOldest(const uint64_t upTo=~0ull) - { - m_greatestSignal = m_sema->getCounterValue(); - while (!m_cb.empty() && m_cb.front().geSemaValue<=upTo) - { - // don't want non-determinitistic behaviour, so execute everything that would have been executed anyway with a while(pollForReady()) - if (m_cb.front().geSemaValue<= m_greatestSignal) - m_cb.front().func(); - m_cb.pop_front(); - } - return resetLatch(); - } - inline uint32_t abortLatest(const uint64_t from=0ull) - { - m_greatestSignal = m_sema->getCounterValue(); - while (!m_cb.empty() && m_cb.back().geSemaValue>=from) - { - // don't want non-determinitistic behaviour, so execute everything that would have been executed anyway with a while(pollForReady()) - if (m_cb.back().geSemaValue<= m_greatestSignal) - m_cb.back().func(); - m_cb.pop_back(); - } - return resetLatch(); - } -}; - -template -class MultiTimelineEventHandlerST final -{ - public: - inline ~MultiTimelineEventHandlerST() - { - for (auto p : m_timelines) - delete p; - } - - inline const auto& getTimelines() const {return m_timelines;} - - // all the members are counteparts of the single timeline version - inline uint32_t count() const - { - uint32_t sum = 0; - for (auto p : m_timelines) - sum += p->count(); - return sum; - } - - inline void latch(ISemaphore* sema, const uint64_t geValue, Functor&& function) - { - auto found = m_timelines.find(sema); - if (found==m_timelines.end()) - found m_timelines.insert(found,new TimelineEventHandlerST(core::smart_refctd_ptr(sema))); - assert((*found)->getSemaphore()==sema); - found->latch(sema,geValue,std::move(function)); - } - - inline uint32_t poll() - { - uint32_t sum = 0; - for (auto p : m_timelines) - { - bool bailed; - p->poll(bailed); - if (bailed) - break; - } - return sum; - } - template - inline uint32_t wait(const std::chrono::time_point& timeout_time) - { - // want to give each event equal wait time, so interpolate (albeit weirdly) - return 455; - } - - inline uint32_t 
abortOldest(const uint64_t upTo=~0ull) - { - uint32_t sum = 0; - for (auto p : m_timelines) - sum += p->abortOldest(upTo); - return sum; - } - inline uint32_t abortLatest(const uint64_t from=0ull) - { - uint32_t sum = 0; - for (auto p : m_timelines) - sum += p->abortLatest(from); - return sum; - } - - private: - struct Compare - { - inline bool operator()(const TimelineEventHandlerST* lhs, const TimelineEventHandlerST* rhs) const - { - return lhs->getSemaphore()getSemaphore(); - } - inline bool operator()(const TimelineEventHandlerST* lhs, const ISemaphore* rhs) const - { - return lhs->getSemaphore() m_timelines; -}; -#endif - } #endif \ No newline at end of file diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h new file mode 100644 index 0000000000..938e749622 --- /dev/null +++ b/include/nbl/video/TimelineEventHandlers.h @@ -0,0 +1,325 @@ +#ifndef _NBL_VIDEO_TIMELINE_EVENT_HANDLERS_H_INCLUDED_ +#define _NBL_VIDEO_TIMELINE_EVENT_HANDLERS_H_INCLUDED_ + + +#include "nbl/video/ILogicalDevice.h" + +#include + + +namespace nbl::video +{ + +// Could be made MT and relatively lockless, if only had a good lock-few circular buffer impl +// Not sure its worth the effort as anything using this will probably need to be lockful to be MT +template +class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable +{ + struct FunctorValuePair + { + Functor func; + uint64_t geSemaValue; + }; + // could be a circular buffer but whatever for now + core::deque m_cb; + core::smart_refctd_ptr m_sema; + uint64_t m_greatestSignal; + uint64_t m_greatestLatch; + + template + inline uint32_t for_each_popping(Lambda&& l) + { + if (m_cb.empty()) + return 0; + + if (QueryCounter) + m_greatestSignal = m_sema->getCounterValue(); + // In a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal + // the way that it would happen is we'd `reserveLock` everything in the buffer so far + // then rewind the reservation 
for anything that doesn't meet the predicate. + // For this to work, the predicate needs to be "consistent" meaning no holes can be formed by multiple actors. + while (!m_cb.empty() && l(m_cb.front())) + m_cb.pop_front(); + m_greatestLatch = m_cb.empty() ? 0:m_cb.back().geSemaValue; + return static_cast(m_cb.size()); + } + + inline auto constructNonBailing() + { + return [&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>m_greatestSignal) + return false; + p.func(); + return true; + }; + } + inline auto constructBailing(bool& bailed) + { + return [&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>m_greatestSignal) + return false; + const bool last_bailed = bailed; + bailed = p.func(); + return !last_bailed; + }; + } + + // If the functor returns bool, then we bail on the on the first executed event during wait,poll,etc. + constexpr static inline bool ReturnsBool = std::is_same_v()()),bool>; + + public: + // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. 
+ inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity = 4095 / sizeof(FunctorValuePair) + 1) : + m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()), m_greatestLatch(0) {} + // If you don't want to deadlock here, look into the `abort*` family of methods + ~TimelineEventHandlerST() + { + while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} + } + // little utility + inline ISemaphore* getSemaphore() const {return m_sema.get();} + + inline uint32_t count() const {return m_cb.size();} + + // You can latch arbitrary functors upon the semaphore reaching some value + inline void latch(const uint64_t geSemaValue, Functor&& function) + { + //const auto oldValue = core::atomic_fetch_max(&m_greatestLatch,geSemaValue); + assert(geSemaValue>=m_greatestLatch); // you cannot latch out of order + m_greatestLatch = geSemaValue; + m_cb.emplace_back(std::move(function),geSemaValue); + } + + // Returns number of events still outstanding + inline uint32_t poll(bool& bailed) + { + bailed = false; + if constexpr (ReturnsBool) + return for_each_popping(constructBailing(bailed)); + else + return for_each_popping(constructNonBailing()); + } + inline uint32_t poll() + { + bool dummy; + return poll(dummy); + } + + template + inline uint32_t wait(const std::chrono::time_point& timeout_time) + { + if (m_cb.empty()) + return 0; + + auto singleSemaphoreWait = [&](const uint64_t waitVal, const std::chrono::time_point& waitPoint)->uint64_t + { + const auto current_time = Clock::now(); + if (waitPoint>current_time) + { + auto device = const_cast(m_sema->getOriginDevice()); + const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); + const ISemaphore::SWaitInfo info = {.semaphore=m_sema.get(),.value = waitVal}; + if (device->waitForSemaphores({&info,1},true,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) + return waitVal>m_greatestSignal ? 
waitVal:m_greatestSignal; // remeber that latch can move back, not signal though + } + return m_sema->getCounterValue(); + }; + + if constexpr (ReturnsBool) + { + // Perf-assumption: there are probably no latched events with wait values less or equal to `m_greatestSignal` + // So we have a bunch of events with semaphore values between `m_greatestSignal` and `m_greatestLatch` with + // lots of repeated latch values incrementing by a fixed K amount between each batch of repeats + auto currentTime = Clock::now(); + do + { + // We cannot wait for the original timeout point because we want to be able to bail, so increment slowly + const auto uniqueValueEstimate = core::min(m_cb.size(),m_greatestSignal-m_greatestLatch); + // weird interpolation that works on integers, basically trying to get somethign 1/uniqueValueEstimate of the way from now to original timeout point + const std::chrono::time_point singleWaitTimePt((currentTime.time_since_epoch()*(uniqueValueEstimate-1u)+timeout_time.time_since_epoch())/uniqueValueEstimate); + // So we only Semaphore wait for the next latch value we need + m_greatestSignal = singleSemaphoreWait(m_cb.front().geSemaValue,singleWaitTimePt); + + bool bailed = false; + for_each_popping(constructBailing(bailed)); + if (bailed) + break; + } while ((currentTime=Clock::now())(constructNonBailing()); + } + } + + // The default behaviour of the underlying event handler is to wait for all events in its destructor. + // This will naturally cause you problems if you add functions latched on values you never signal, + // such as when you change your mind whether to submit. This method is then helpful to avoid a deadlock. 
+ inline uint32_t abortOldest(const uint64_t upTo) + { + return for_each_popping([&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>upTo) + return false; + // don't want weird behaviour, so execute everything that would have been executed + // if a single `poll()` was called before `abortOldest` + if (p.geSemaValue<=m_greatestSignal) + p.func(); + return true; + } + ); + } + inline uint32_t abortLatest(const uint64_t from) + { + // We also need to run the functors in the same order they'd be ran with a single `poll()`, + // so we run all of them from the front, not just from the `from` value. + for_each_popping(constructNonBailing()); + // now kill the latest stuff + while (!m_cb.empty() && m_cb.back().geSemaValue>=from) + m_cb.pop_back(); + return m_cb.size(); + } + inline void abortAll() {abortOldest(~0ull);} +}; + +// +template +class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable +{ + public: + using TimelineEventHandler = TimelineEventHandlerST; + inline ~MultiTimelineEventHandlerST() + { + clear(); + } + + inline const auto& getTimelines() const {return m_timelines;} + + // all the members are counteparts of the single timeline version + inline uint32_t count() const + { + uint32_t sum = 0; + for (auto p : m_timelines) + sum += p->count(); + return sum; + } + + inline void latch(ISemaphore* sema, const uint64_t geValue, Functor&& function) + { + auto found = m_timelines.find(sema); + if (found==m_timelines.end()) + { + STimeline newTimeline = { + .handler = new TimelineEventHandler(core::smart_refctd_ptr(sema)), + .waitInfoIx = m_scratchWaitInfos.size() + }; + found = m_timelines.insert(found,std::move(newTimeline)); + m_scratchWaitInfos.emplace_back(sema,0xdeadbeefBADC0FFEull); + } + assert(found->handler->getSemaphore()==sema); + found->handler->latch(sema,geValue,std::move(function)); + } + + inline uint32_t poll() + { + uint32_t sum = 0; + for (auto p : m_timelines) + { + bool bailed; + p->poll(bailed); + if (bailed) + break; 
+ } + return sum; + } + +#if 0 + template + inline uint32_t wait(const std::chrono::time_point& timeout_time) + { + return 455; + } +#endif + + inline void abortAll() + { + for (auto& p : m_timelines) + p.handler->abortAll(); + clear(); + } + inline uint32_t abortOldest(const uint64_t upTo=~0ull) + { + uint32_t sum = 0; + for (auto& p : m_timelines) + sum += p.handler->abortOldest(upTo); + return sum; + } + inline uint32_t abortLatest(const uint64_t from=0ull) + { + uint32_t sum = 0; + for (auto& p : m_timelines) + sum += p.handler->abortLatest(from); + return sum; + } + + private: + struct STimeline + { + inline auto operator<=>(const STimeline& rhs) const + { + return handler->getSemaphore()-rhs.handler->getSemaphore(); + } + inline auto operator<=>(const ISemaphore* rhs) const + { + return handler->getSemaphore()-rhs; + } + + TimelineEventHandler* handler; + size_t waitInfoIx; + }; + // We use a `set<>` instead of `unordered_set<>` because we assume you won't spam semaphores/timelines + using container_t = core::set; + + template + inline uint32_t for_each_erasing(Lambda&& l) + { + uint32_t sum = 0; + // we don't check erasing when l(*it)==false on purpose, it only happens in poll and the timeline semaphore is likely to get re-added + for (auto it=m_timelines.begin(); it!=m_timelines.end() && l(*it); ) + it = it->handler->count() ? 
(it++):eraseTimeline(it); + return sum; + } + + inline container_t::iterator eraseTimeline(container_t::iterator timeline) + { + // if not the last in scratch + if (timeline->waitInfoIxwaitInfoIx; + m_scratchWaitInfos[timeline->waitInfoIx] = lastScratch; + } + m_scratchWaitInfos.pop_back(); + delete timeline->handler; + return m_timelines.erase(timeline); + } + + inline void clear() + { + m_scratchWaitInfos.clear(); + for (auto p : m_timelines) + delete p.handler; + m_timelines.clear(); + } + + container_t m_timelines; + core::vector m_scratchWaitInfos; +}; + +} +#endif \ No newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 7c7719052e..8297a09692 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -275,7 +275,6 @@ set(NBL_VIDEO_SOURCES ${NBL_ROOT_PATH}/src/nbl/video/ILogicalDevice.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUAccelerationStructure.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUCommandBuffer.cpp - ${NBL_ROOT_PATH}/src/nbl/video/ISemaphore.cpp ${NBL_ROOT_PATH}/src/nbl/video/IQueue.cpp ${NBL_ROOT_PATH}/src/nbl/video/IGPUDescriptorSet.cpp ${NBL_ROOT_PATH}/src/nbl/video/IDeviceMemoryAllocation.cpp diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 002dad3ae7..0714481ac8 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -79,8 +79,10 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaph return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), semaphore, std::move(params)); } -auto CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) -> WAIT_RESULT +ISemaphore::WAIT_RESULT CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) { + using retval_t = ISemaphore::WAIT_RESULT; + core::vector semaphores(infos.size()); core::vector values(infos.size()); auto outSemaphores = semaphores.data(); @@ 
-89,7 +91,7 @@ auto CVulkanLogicalDevice::waitForSemaphores(const std::span(info.semaphore,this); if (!sema) - WAIT_RESULT::_ERROR; + retval_t::_ERROR; *(outSemaphores++) = sema->getInternalObject(); *(outValues++) = info.value; } @@ -102,15 +104,15 @@ auto CVulkanLogicalDevice::waitForSemaphores(const std::span CVulkanLogicalDevice::createEvent(const IEvent::CREATE_FLAGS flags) diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index d8f934ceb9..0df38ffd67 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -53,7 +53,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice } core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) override; - WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; + ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; diff --git a/src/nbl/video/ISemaphore.cpp b/src/nbl/video/ISemaphore.cpp deleted file mode 100644 index 18eca04e5a..0000000000 --- a/src/nbl/video/ISemaphore.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "nbl/video/ISemaphore.h" -#include "nbl/video/ILogicalDevice.h" - -namespace nbl::video -{ - -bool TimelineEventHandlerBase::notTimedOut(const uint64_t value, const uint64_t nanoseconds) -{ - const ILogicalDevice::SSemaphoreWaitInfo info = {.semaphore=m_sema.get(),.value=value}; - switch (const_cast(m_sema->getOriginDevice())->waitForSemaphores({&info,1},true,nanoseconds)) - { - case ILogicalDevice::WAIT_RESULT::TIMEOUT: - return false; - break; - default: - break; - } - return true; -} - -} \ No newline at end of file From a1afcc88ce310b354941fa923aa90b5c67d66a55 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 19:17:14 +0100 Subject: [PATCH 19/62] Made the TimelineEventHandlerST use a const ISemaphore, almost all of 
MultiTimelineEventHandlerST is implemented --- include/nbl/video/TimelineEventHandlers.h | 215 ++++++++++++++-------- 1 file changed, 134 insertions(+), 81 deletions(-) diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 938e749622..865d3b6469 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -9,12 +9,16 @@ namespace nbl::video { +template +class MultiTimelineEventHandlerST; // Could be made MT and relatively lockless, if only had a good lock-few circular buffer impl // Not sure its worth the effort as anything using this will probably need to be lockful to be MT template class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable { + friend MultiTimelineEventHandlerST; + struct FunctorValuePair { Functor func; @@ -22,10 +26,10 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable }; // could be a circular buffer but whatever for now core::deque m_cb; - core::smart_refctd_ptr m_sema; + core::smart_refctd_ptr m_sema; uint64_t m_greatestSignal; uint64_t m_greatestLatch; - + template inline uint32_t for_each_popping(Lambda&& l) { @@ -44,34 +48,33 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable return static_cast(m_cb.size()); } - inline auto constructNonBailing() + template + inline auto constructNonBailing(Args&&... args) { return [&](FunctorValuePair& p) -> bool { if (p.geSemaValue>m_greatestSignal) return false; - p.func(); + p.func(std::forward(args)...); return true; }; } - inline auto constructBailing(bool& bailed) + template + inline auto constructBailing(bool& bailed, Args&&... args) { return [&](FunctorValuePair& p) -> bool { if (p.geSemaValue>m_greatestSignal) return false; const bool last_bailed = bailed; - bailed = p.func(); + bailed = p.func(std::forward(args)...); return !last_bailed; }; } - // If the functor returns bool, then we bail on the on the first executed event during wait,poll,etc. 
- constexpr static inline bool ReturnsBool = std::is_same_v()()),bool>; - public: // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. - inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity = 4095 / sizeof(FunctorValuePair) + 1) : + inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity = 4095 / sizeof(FunctorValuePair) + 1) : m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()), m_greatestLatch(0) {} // If you don't want to deadlock here, look into the `abort*` family of methods ~TimelineEventHandlerST() @@ -79,7 +82,7 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} } // little utility - inline ISemaphore* getSemaphore() const {return m_sema.get();} + inline const ISemaphore* getSemaphore() const {return m_sema.get();} inline uint32_t count() const {return m_cb.size();} @@ -92,29 +95,36 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable m_cb.emplace_back(std::move(function),geSemaValue); } - // Returns number of events still outstanding - inline uint32_t poll(bool& bailed) + // + struct PollResult { - bailed = false; + uint32_t eventsLeft = ~0u; + bool bailed = false; + }; + template + inline PollResult poll(Args&&... 
args) + { + PollResult retval = {}; + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; if constexpr (ReturnsBool) - return for_each_popping(constructBailing(bailed)); + retval.eventsLeft = for_each_popping(constructBailing(retval.bailed,std::forward(args)...)); else - return for_each_popping(constructNonBailing()); - } - inline uint32_t poll() - { - bool dummy; - return poll(dummy); + retval.eventsLeft = for_each_popping(constructNonBailing(std::forward(args)...)); + return retval; } - template - inline uint32_t wait(const std::chrono::time_point& timeout_time) + template + inline uint32_t wait(const std::chrono::time_point& timeout_time, Args&&... args) { if (m_cb.empty()) return 0; - auto singleSemaphoreWait = [&](const uint64_t waitVal, const std::chrono::time_point& waitPoint)->uint64_t + auto singleSemaphoreWait = [&](const uint64_t waitVal, const std::chrono::time_point& waitPoint) -> void { + // remeber that latch can move back, not signal though + if (waitVal<=m_greatestSignal) + return; + const auto current_time = Clock::now(); if (waitPoint>current_time) { @@ -122,11 +132,12 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); const ISemaphore::SWaitInfo info = {.semaphore=m_sema.get(),.value = waitVal}; if (device->waitForSemaphores({&info,1},true,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) - return waitVal>m_greatestSignal ? 
waitVal:m_greatestSignal; // remeber that latch can move back, not signal though + m_greatestSignal = waitVal; } - return m_sema->getCounterValue(); + m_greatestSignal = m_sema->getCounterValue(); }; - + + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; if constexpr (ReturnsBool) { // Perf-assumption: there are probably no latched events with wait values less or equal to `m_greatestSignal` @@ -140,10 +151,10 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable // weird interpolation that works on integers, basically trying to get somethign 1/uniqueValueEstimate of the way from now to original timeout point const std::chrono::time_point singleWaitTimePt((currentTime.time_since_epoch()*(uniqueValueEstimate-1u)+timeout_time.time_since_epoch())/uniqueValueEstimate); // So we only Semaphore wait for the next latch value we need - m_greatestSignal = singleSemaphoreWait(m_cb.front().geSemaValue,singleWaitTimePt); + singleSemaphoreWait(m_cb.front().geSemaValue,singleWaitTimePt); bool bailed = false; - for_each_popping(constructBailing(bailed)); + for_each_popping(constructBailing(bailed,std::forward(args)...)); if (bailed) break; } while ((currentTime=Clock::now())(constructNonBailing()); + singleSemaphoreWait(m_greatestLatch,timeout_time); + return for_each_popping(constructNonBailing(std::forward(args)...)); } } // The default behaviour of the underlying event handler is to wait for all events in its destructor. // This will naturally cause you problems if you add functions latched on values you never signal, // such as when you change your mind whether to submit. This method is then helpful to avoid a deadlock. - inline uint32_t abortOldest(const uint64_t upTo) + template + inline uint32_t abortOldest(const uint64_t upTo, Args&&... 
args) { return for_each_popping([&](FunctorValuePair& p) -> bool { @@ -168,22 +180,24 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable // don't want weird behaviour, so execute everything that would have been executed // if a single `poll()` was called before `abortOldest` if (p.geSemaValue<=m_greatestSignal) - p.func(); + p.func(std::forward(args)...); return true; } ); } - inline uint32_t abortLatest(const uint64_t from) + template + inline uint32_t abortLatest(const uint64_t from, Args&&... args) { // We also need to run the functors in the same order they'd be ran with a single `poll()`, // so we run all of them from the front, not just from the `from` value. - for_each_popping(constructNonBailing()); + for_each_popping(constructNonBailing(std::forward(args)...)); // now kill the latest stuff while (!m_cb.empty() && m_cb.back().geSemaValue>=from) m_cb.pop_back(); return m_cb.size(); } - inline void abortAll() {abortOldest(~0ull);} + template + inline void abortAll(Args&&... 
args) {abortOldest(~0ull,std::forward(args)...);} }; // @@ -192,6 +206,8 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable { public: using TimelineEventHandler = TimelineEventHandlerST; + + inline MultiTimelineEventHandlerST(core::smart_refctd_ptr&& device) : m_device(std::move(device)) {} inline ~MultiTimelineEventHandlerST() { clear(); @@ -204,46 +220,104 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable { uint32_t sum = 0; for (auto p : m_timelines) - sum += p->count(); + sum += p.handler->count(); return sum; } - inline void latch(ISemaphore* sema, const uint64_t geValue, Functor&& function) + inline bool latch(const ISemaphore::SWaitInfo& futureWait, Functor&& function) { - auto found = m_timelines.find(sema); + auto found = m_timelines.find(futureWait.semaphore); if (found==m_timelines.end()) { + if (futureWait.semaphore->getOriginDevice()!=m_device.get()) + return false; STimeline newTimeline = { - .handler = new TimelineEventHandler(core::smart_refctd_ptr(sema)), + .handler = new TimelineEventHandler(core::smart_refctd_ptr(futureWait.semaphore)), .waitInfoIx = m_scratchWaitInfos.size() }; found = m_timelines.insert(found,std::move(newTimeline)); - m_scratchWaitInfos.emplace_back(sema,0xdeadbeefBADC0FFEull); + m_scratchWaitInfos.emplace_back(futureWait.semaphore,0xdeadbeefBADC0FFEull); } - assert(found->handler->getSemaphore()==sema); - found->handler->latch(sema,geValue,std::move(function)); + assert(found->handler->getSemaphore()==futureWait.semaphore); + found->handler->latch(futureWait.value,std::move(function)); + return true; } - inline uint32_t poll() - { - uint32_t sum = 0; - for (auto p : m_timelines) + template + inline typename TimelineEventHandler::PollResult poll(Args&&... 
args) + { + typename TimelineEventHandler::PollResult retval = {0,false}; + for (typename container_t::iterator it=m_timelines.begin(); it!=m_timelines.end(); ) { - bool bailed; - p->poll(bailed); - if (bailed) - break; + if (!retval.bailed) + { + const auto local = it->handler->poll(); + retval.eventsLeft += local.eventsLeft; + retval.bailed = local.bailed; + } + if (it->handler->count()) + it++; + else + it = eraseTimeline(it); } - return sum; + return retval; } -#if 0 template inline uint32_t wait(const std::chrono::time_point& timeout_time) { + bool allEmpty = true; + for (typename container_t::iterator it=m_timelines.begin(); it!=m_timelines.end(); ) + { + if (it->handler->count()) + { +#if 0 + // TODO: adapt + const waitVal = it->handler->m_greatestLatch; + // need to fill all waits anyway + m_scratchWaitInfos[it->waitInfoIx].value = waitVal; + // remeber that latch can move back, not signal though + if (waitVal>it->handler->m_greatestSignal) + allEmpty = false; +#endif + it++; + } + else + it = eraseTimeline(it); + } + if (allEmpty) + return 0; + + constexpr bool ReturnsBool = false; + auto singleSemaphoreWait = [&](const std::chrono::time_point& waitPoint) -> bool + { + const auto current_time = Clock::now(); + if (waitPoint>current_time) + { + const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); + if (m_device->waitForSemaphores(m_scratchWaitInfos,!ReturnsBool,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) + return true; // remeber that latch can move back, not signal though + } + // + + return false; + }; + + if constexpr (ReturnsBool) + { + return 600; + } + else + { + if (singleSemaphoreWait(timeout_time)) + { + clear(); + return 0; + } + } + return 455; } -#endif inline void abortAll() { @@ -251,20 +325,6 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable p.handler->abortAll(); clear(); } - inline uint32_t abortOldest(const uint64_t upTo=~0ull) - { - uint32_t sum = 0; - for (auto& p : 
m_timelines) - sum += p.handler->abortOldest(upTo); - return sum; - } - inline uint32_t abortLatest(const uint64_t from=0ull) - { - uint32_t sum = 0; - for (auto& p : m_timelines) - sum += p.handler->abortLatest(from); - return sum; - } private: struct STimeline @@ -282,26 +342,18 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable size_t waitInfoIx; }; // We use a `set<>` instead of `unordered_set<>` because we assume you won't spam semaphores/timelines - using container_t = core::set; - - template - inline uint32_t for_each_erasing(Lambda&& l) - { - uint32_t sum = 0; - // we don't check erasing when l(*it)==false on purpose, it only happens in poll and the timeline semaphore is likely to get re-added - for (auto it=m_timelines.begin(); it!=m_timelines.end() && l(*it); ) - it = it->handler->count() ? (it++):eraseTimeline(it); - return sum; - } + // also we need to be able to continue iteration after an erasure of a single element + using container_t = core::set/*quirk of STL*/>; - inline container_t::iterator eraseTimeline(container_t::iterator timeline) + inline container_t::iterator eraseTimeline(typename container_t::iterator timeline) { // if not the last in scratch if (timeline->waitInfoIxwaitInfoIx; + typename container_t::iterator found = m_timelines.find(lastScratch.semaphore); +// found->waitInfoIx = timeline->waitInfoIx; m_scratchWaitInfos[timeline->waitInfoIx] = lastScratch; } m_scratchWaitInfos.pop_back(); @@ -319,6 +371,7 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable container_t m_timelines; core::vector m_scratchWaitInfos; + core::smart_refctd_ptr m_device; }; } From 262281fecc6e05bd93ac73455f91384050d156b1 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 21:31:14 +0100 Subject: [PATCH 20/62] implement MultiTimelineEventHandlerST and correct TimelineEventHandlerST --- include/nbl/video/TimelineEventHandlers.h | 269 +++++++++++++--------- 1 file changed, 159 insertions(+), 110 
deletions(-) diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 865d3b6469..0f4e7015a7 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -17,64 +17,9 @@ class MultiTimelineEventHandlerST; template class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable { - friend MultiTimelineEventHandlerST; - - struct FunctorValuePair - { - Functor func; - uint64_t geSemaValue; - }; - // could be a circular buffer but whatever for now - core::deque m_cb; - core::smart_refctd_ptr m_sema; - uint64_t m_greatestSignal; - uint64_t m_greatestLatch; - - template - inline uint32_t for_each_popping(Lambda&& l) - { - if (m_cb.empty()) - return 0; - - if (QueryCounter) - m_greatestSignal = m_sema->getCounterValue(); - // In a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal - // the way that it would happen is we'd `reserveLock` everything in the buffer so far - // then rewind the reservation for anything that doesn't meet the predicate. - // For this to work, the predicate needs to be "consistent" meaning no holes can be formed by multiple actors. - while (!m_cb.empty() && l(m_cb.front())) - m_cb.pop_front(); - m_greatestLatch = m_cb.empty() ? 0:m_cb.back().geSemaValue; - return static_cast(m_cb.size()); - } - - template - inline auto constructNonBailing(Args&&... args) - { - return [&](FunctorValuePair& p) -> bool - { - if (p.geSemaValue>m_greatestSignal) - return false; - p.func(std::forward(args)...); - return true; - }; - } - template - inline auto constructBailing(bool& bailed, Args&&... 
args) - { - return [&](FunctorValuePair& p) -> bool - { - if (p.geSemaValue>m_greatestSignal) - return false; - const bool last_bailed = bailed; - bailed = p.func(std::forward(args)...); - return !last_bailed; - }; - } - public: // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. - inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity = 4095 / sizeof(FunctorValuePair) + 1) : + inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()), m_greatestLatch(0) {} // If you don't want to deadlock here, look into the `abort*` family of methods ~TimelineEventHandlerST() @@ -104,13 +49,7 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable template inline PollResult poll(Args&&... args) { - PollResult retval = {}; - constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; - if constexpr (ReturnsBool) - retval.eventsLeft = for_each_popping(constructBailing(retval.bailed,std::forward(args)...)); - else - retval.eventsLeft = for_each_popping(constructNonBailing(std::forward(args)...)); - return retval; + return poll_impl(std::forward(args)...); } template @@ -132,7 +71,10 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); const ISemaphore::SWaitInfo info = {.semaphore=m_sema.get(),.value = waitVal}; if (device->waitForSemaphores({&info,1},true,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) + { m_greatestSignal = waitVal; + return; + } } m_greatestSignal = m_sema->getCounterValue(); }; @@ -198,6 +140,74 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable } template inline void abortAll(Args&&... 
args) {abortOldest(~0ull,std::forward(args)...);} + + private: + friend MultiTimelineEventHandlerST; + + struct FunctorValuePair + { + Functor func; + uint64_t geSemaValue; + }; + // could be a circular buffer but whatever for now + core::deque m_cb; + core::smart_refctd_ptr m_sema; + uint64_t m_greatestSignal; + uint64_t m_greatestLatch; + + template + inline uint32_t for_each_popping(Lambda&& l) + { + if (m_cb.empty()) + return 0; + + if (QueryCounter) + m_greatestSignal = m_sema->getCounterValue(); + // In a threadsafe scenario, you'd immediately pop everything you can with geSemaValue<=signal + // the way that it would happen is we'd `reserveLock` everything in the buffer so far + // then rewind the reservation for anything that doesn't meet the predicate. + // For this to work, the predicate needs to be "consistent" meaning no holes can be formed by multiple actors. + while (!m_cb.empty() && l(m_cb.front())) + m_cb.pop_front(); + m_greatestLatch = m_cb.empty() ? 0:m_cb.back().geSemaValue; + return static_cast(m_cb.size()); + } + + template + inline auto constructNonBailing(Args&&... args) + { + return [&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>m_greatestSignal) + return false; + p.func(std::forward(args)...); + return true; + }; + } + template + inline auto constructBailing(bool& bailed, Args&&... args) + { + return [&](FunctorValuePair& p) -> bool + { + if (p.geSemaValue>m_greatestSignal) + return false; + const bool last_bailed = bailed; + bailed = p.func(std::forward(args)...); + return !last_bailed; + }; + } + + template + inline PollResult poll_impl(Args&&... 
args) + { + PollResult retval = {}; + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)), bool>; + if constexpr (ReturnsBool) + retval.eventsLeft = for_each_popping(constructBailing(retval.bailed, std::forward(args)...)); + else + retval.eventsLeft = for_each_popping(constructNonBailing(std::forward(args)...)); + return retval; + } }; // @@ -247,11 +257,11 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable inline typename TimelineEventHandler::PollResult poll(Args&&... args) { typename TimelineEventHandler::PollResult retval = {0,false}; - for (typename container_t::iterator it=m_timelines.begin(); it!=m_timelines.end(); ) + for (auto it=m_timelines.begin(); it!=m_timelines.end(); ) { if (!retval.bailed) { - const auto local = it->handler->poll(); + const auto local = it->handler->poll(std::forward(args)...); retval.eventsLeft += local.eventsLeft; retval.bailed = local.bailed; } @@ -263,60 +273,88 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable return retval; } - template - inline uint32_t wait(const std::chrono::time_point& timeout_time) + template + inline uint32_t wait(const std::chrono::time_point& timeout_time, Args&&... 
args) { - bool allEmpty = true; - for (typename container_t::iterator it=m_timelines.begin(); it!=m_timelines.end(); ) - { - if (it->handler->count()) - { -#if 0 - // TODO: adapt - const waitVal = it->handler->m_greatestLatch; - // need to fill all waits anyway - m_scratchWaitInfos[it->waitInfoIx].value = waitVal; - // remeber that latch can move back, not signal though - if (waitVal>it->handler->m_greatestSignal) - allEmpty = false; -#endif - it++; - } - else - it = eraseTimeline(it); - } - if (allEmpty) - return 0; - - constexpr bool ReturnsBool = false; - auto singleSemaphoreWait = [&](const std::chrono::time_point& waitPoint) -> bool + auto nanosecondsLeft = [](const std::chrono::time_point& waitPoint)->uint64_t { const auto current_time = Clock::now(); - if (waitPoint>current_time) - { - const auto nanosecondsLeft = std::chrono::duration_cast(waitPoint-current_time).count(); - if (m_device->waitForSemaphores(m_scratchWaitInfos,!ReturnsBool,nanosecondsLeft)==ISemaphore::WAIT_RESULT::SUCCESS) - return true; // remeber that latch can move back, not signal though - } - // - - return false; + if (current_time>=waitPoint) + return 0; + return std::chrono::duration_cast(waitPoint-current_time).count(); }; - if constexpr (ReturnsBool) - { - return 600; - } - else + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; + constexpr bool WaitAll = !ReturnsBool; + + uint32_t sum = 0; + do { - if (singleSemaphoreWait(timeout_time)) + auto uniqueValueEstimate = 1; + // `waitsToPerform` isn't very conservative, it doesn't mean there are no latched events + // instead it means that there is no point waiting with the device on the semaphore + // because the value we're about to wait for was already attained. 
+ bool waitsToPerform = false; + // first gather all the wait values if there's time to even perform a wait + if (nanosecondsLeft(timeout_time)) + for (auto it=m_timelines.begin(); it!=m_timelines.end(); ) { - clear(); - return 0; + // will return 0 for an empty event list + const auto waitVal = it->getWaitValue(WaitAll); + if (waitVal) + { + // need to fill all waits anyway even if its redudant + m_scratchWaitInfos[it->waitInfoIx].value = waitVal; + // remeber that latch can move back, not the signal though + if (waitVal>it->handler->m_greatestSignal) + { + uniqueValueEstimate = core::max(core::min(it->handler->m_cb.size(),it->handler->m_greatestSignal-it->handler->m_greatestLatch),uniqueValueEstimate); + waitsToPerform = true; + } + it++; + } + else + it = eraseTimeline(it); } - } - return 455; + bool allReady = false; + if (waitsToPerform) + { + const std::chrono::time_point singleWaitTimePt((Clock::now().time_since_epoch()*(uniqueValueEstimate-1u)+timeout_time.time_since_epoch())/uniqueValueEstimate); + if (const auto nano = nanosecondsLeft(WaitAll ? timeout_time:singleWaitTimePt)) + if (m_device->waitForSemaphores(m_scratchWaitInfos,WaitAll,nano)==ISemaphore::WAIT_RESULT::SUCCESS) + allReady = WaitAll || m_scratchWaitInfos.size()==1; + } + + sum = 0; + bool bailed = false; + for (auto it=m_timelines.begin(); it!=m_timelines.end(); ) + { + auto* handler = it->handler; + // only if we waited for all semaphores, we can just set their greatest signal value to the value we awaited + handler->m_greatestSignal = allReady ? 
it->getWaitValue(WaitAll):handler->getSemaphore()->getCounterValue(); + if (bailed) + sum += handler->count(); + else + { + const auto local = handler->poll_impl(std::forward(args)...); + bailed = local.bailed; + // if don't have any events left, remove the timeline + if (local.eventsLeft) + { + sum += local.eventsLeft; + it++; + } + // but there's a fast path at the end + else if (ReturnsBool || !allReady) + it = eraseTimeline(it); + } + } + // ultra fast path for non-bailing code when everything was covered by a single wait + if (WaitAll && allReady) + clear(); + } while (sum && Clock::now()m_cb.empty()) + return 0ull; + // following same assumptions as the single-timeline case + if (waitAll) + return handler->m_greatestLatch; + else + return handler->m_cb.front().geSemaValue; + } + inline auto operator<=>(const STimeline& rhs) const { return handler->getSemaphore()-rhs.handler->getSemaphore(); From d7690be686496be53571468c3e465424d1848496 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 22:17:37 +0100 Subject: [PATCH 21/62] fix KHR function loading bugs --- src/nbl/video/CVulkanCommandBuffer.cpp | 8 ++++---- src/nbl/video/CVulkanSemaphore.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index af090c92c3..188ca33595 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -153,13 +153,13 @@ bool CVulkanCommandBuffer::setEvent_impl(IEvent* const _event, const SEventDepen return false; auto info = fill(memoryBarriers.data(),bufferBarriers.data(),imageBarriers.data(),depInfo); - getFunctionTable().vkCmdSetEvent2KHR(m_cmdbuf,static_cast(_event)->getInternalObject(),&info); + getFunctionTable().vkCmdSetEvent2(m_cmdbuf,static_cast(_event)->getInternalObject(),&info); return true; } bool CVulkanCommandBuffer::resetEvent_impl(IEvent* const _event, const core::bitflag stageMask) { - 
getFunctionTable().vkCmdResetEvent2KHR(m_cmdbuf,static_cast(_event)->getInternalObject(),getVkPipelineStageFlagsFromPipelineStageFlags(stageMask)); + getFunctionTable().vkCmdResetEvent2(m_cmdbuf,static_cast(_event)->getInternalObject(),getVkPipelineStageFlagsFromPipelineStageFlags(stageMask)); return true; } @@ -196,7 +196,7 @@ bool CVulkanCommandBuffer::waitEvents_impl(const uint32_t eventCount, IEvent* co bufBarrierCount += infos[i].bufferMemoryBarrierCount; imgBarrierCount += infos[i].imageMemoryBarrierCount; } - getFunctionTable().vkCmdWaitEvents2KHR(m_cmdbuf,eventCount,events.data(),infos.data()); + getFunctionTable().vkCmdWaitEvents2(m_cmdbuf,eventCount,events.data(),infos.data()); return true; } @@ -553,7 +553,7 @@ bool CVulkanCommandBuffer::endQuery_impl(IQueryPool* const queryPool, const uint bool CVulkanCommandBuffer::writeTimestamp_impl(const asset::PIPELINE_STAGE_FLAGS pipelineStage, IQueryPool* const queryPool, const uint32_t query) { - getFunctionTable().vkCmdWriteTimestamp2KHR(m_cmdbuf, getVkPipelineStageFlagsFromPipelineStageFlags(pipelineStage), static_cast(queryPool)->getInternalObject(), query); + getFunctionTable().vkCmdWriteTimestamp2(m_cmdbuf, getVkPipelineStageFlagsFromPipelineStageFlags(pipelineStage), static_cast(queryPool)->getInternalObject(), query); return true; } diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index d3dbce8e12..071c4b2843 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -15,7 +15,7 @@ uint64_t CVulkanSemaphore::getCounterValue() const { uint64_t retval = 0u; const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - vulkanDevice->getFunctionTable()->vk.vkGetSemaphoreCounterValueKHR(vulkanDevice->getInternalObject(), m_semaphore, &retval); + vulkanDevice->getFunctionTable()->vk.vkGetSemaphoreCounterValue(vulkanDevice->getInternalObject(), m_semaphore, &retval); return retval; } @@ -26,7 +26,7 @@ void 
CVulkanSemaphore::signal(const uint64_t value) info.value = value; const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - vulkanDevice->getFunctionTable()->vk.vkSignalSemaphoreKHR(vulkanDevice->getInternalObject(), &info); + vulkanDevice->getFunctionTable()->vk.vkSignalSemaphore(vulkanDevice->getInternalObject(), &info); } void CVulkanSemaphore::setObjectDebugName(const char* label) const From 13ff02a6abae79a2e9c21fa1581b744b9774ac51 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 22:41:57 +0100 Subject: [PATCH 22/62] fix some nasty bug in TimelineEventHandlerST --- include/nbl/video/TimelineEventHandlers.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 0f4e7015a7..33832c0605 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -20,7 +20,10 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable public: // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. 
inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : - m_sema(std::move(sema)), m_greatestSignal(m_sema->getCounterValue()), m_greatestLatch(0) {} + m_sema(std::move(sema)), m_greatestLatch(0) + { + m_greatestSignal = m_sema->getCounterValue(); + } // If you don't want to deadlock here, look into the `abort*` family of methods ~TimelineEventHandlerST() { @@ -189,11 +192,11 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable { return [&](FunctorValuePair& p) -> bool { - if (p.geSemaValue>m_greatestSignal) + if (bailed || p.geSemaValue>m_greatestSignal) return false; - const bool last_bailed = bailed; + const bool bailedBefore = bailed; bailed = p.func(std::forward(args)...); - return !last_bailed; + return !bailedBefore; }; } @@ -201,7 +204,7 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable inline PollResult poll_impl(Args&&... args) { PollResult retval = {}; - constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)), bool>; + constexpr bool ReturnsBool = std::is_same_v()(std::forward(args)...)),bool>; if constexpr (ReturnsBool) retval.eventsLeft = for_each_popping(constructBailing(retval.bailed, std::forward(args)...)); else @@ -223,6 +226,8 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable clear(); } + inline ILogicalDevice* getLogicalDevice() const {return m_device.get();} + inline const auto& getTimelines() const {return m_timelines;} // all the members are counteparts of the single timeline version @@ -348,6 +353,8 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable // but there's a fast path at the end else if (ReturnsBool || !allReady) it = eraseTimeline(it); + else + it++; } } // ultra fast path for non-bailing code when everything was covered by a single wait From fabc999ce72859d6c3f37226a5925424aeef32e8 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 23:14:42 
+0100 Subject: [PATCH 23/62] Take the TimelineEventHandlerST for a first spin with ICommandPoolCache --- .../nbl/video/utilities/ICommandPoolCache.h | 43 +++++++++----- .../nbl/video/utilities/IDescriptorSetCache.h | 58 +++++++++++++------ src/nbl/CMakeLists.txt | 1 - src/nbl/video/utilities/ICommandPoolCache.cpp | 12 +--- .../video/utilities/IDescriptorSetCache.cpp | 25 -------- 5 files changed, 68 insertions(+), 71 deletions(-) delete mode 100644 src/nbl/video/utilities/IDescriptorSetCache.cpp diff --git a/include/nbl/video/utilities/ICommandPoolCache.h b/include/nbl/video/utilities/ICommandPoolCache.h index f86ebde930..6f384aa60b 100644 --- a/include/nbl/video/utilities/ICommandPoolCache.h +++ b/include/nbl/video/utilities/ICommandPoolCache.h @@ -8,6 +8,7 @@ #include "nbl/asset/asset.h" #include "nbl/video/IGPUCommandPool.h" +#include "nbl/video/TimelineEventHandlers.h" namespace nbl::video @@ -18,7 +19,22 @@ class ICommandPoolCache : public core::IReferenceCounted public: using CommandPoolAllocator = core::PoolAddressAllocatorST; - NBL_API2 ICommandPoolCache(ILogicalDevice* const device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity); + // + static inline core::smart_refctd_ptr create(core::smart_refctd_ptr&& device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity) + { + auto cache = new core::smart_refctd_ptr[capacity]; + if (!cache) + return nullptr; + + for (auto i = 0u; icreateCommandPool(queueFamilyIx,_flags); + + void* reserved = malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u)); + if (!reserved) + return nullptr; + + return core::smart_refctd_ptr(new ICommandPoolCache(std::move(device),cache,capacity,reserved),core::dont_grab); + } // inline uint32_t getCapacity() const {return m_cmdPoolAllocator.get_total_size();} @@ -32,33 +48,26 @@ class ICommandPoolCache : public core::IReferenceCounted return nullptr; } -#if 0 // TODO: port // inline uint32_t acquirePool() { - 
m_deferredResets.pollForReadyEvents(DeferredCommandPoolResetter::single_poll); + m_deferredResets.poll(DeferredCommandPoolResetter::single_poll); return m_cmdPoolAllocator.alloc_addr(1u,1u); } - // needs to be called before you reset any fences which latch the deferred release - inline void poll_all() - { - m_deferredResets.pollForReadyEvents(DeferredCommandPoolResetter::exhaustive_poll); - } - // - inline void releaseSet(ILogicalDevice* device, core::smart_refctd_ptr&& fence, const uint32_t poolIx) + inline void releasePool(const ISemaphore::SWaitInfo& futureWait, const uint32_t poolIx) { if (poolIx==invalid_index) return; - if (fence) - m_deferredResets.addEvent(GPUEventWrapper(device,std::move(fence)),DeferredCommandPoolResetter(this,poolIx)); + if (futureWait.semaphore) + m_deferredResets.latch(futureWait,DeferredCommandPoolResetter(this,poolIx)); else releaseSet(poolIx); } - // only public because GPUDeferredEventHandlerST needs to know about it + // only public because MultiTimelineEventHandlerST needs to know about it class DeferredCommandPoolResetter { ICommandPoolCache* m_cache; @@ -106,13 +115,15 @@ class ICommandPoolCache : public core::IReferenceCounted NBL_API2 void operator()(); }; -#endif protected: friend class DeferredCommandPoolResetter; + inline ICommandPoolCache(core::smart_refctd_ptr&& device, core::smart_refctd_ptr* cache, const uint32_t capacity, void* reserved) : + m_cache(cache), m_reserved(malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u))), m_cmdPoolAllocator(m_reserved,0u,0u,1u,capacity,1u), m_deferredResets(std::move(device)) {} inline virtual ~ICommandPoolCache() { -// m_deferredResets.cullEvents(0u); + // normally the dtor would do this, but we need all the events to run before we delete the storage they reference + while (m_deferredResets.wait(std::chrono::steady_clock::now()+std::chrono::milliseconds(1))) {} free(m_reserved); delete[] m_cache; } @@ -122,7 +133,7 @@ class ICommandPoolCache : public core::IReferenceCounted 
core::smart_refctd_ptr* m_cache; void* m_reserved; CommandPoolAllocator m_cmdPoolAllocator; -// GPUDeferredEventHandlerST m_deferredResets; + MultiTimelineEventHandlerST m_deferredResets; }; } diff --git a/include/nbl/video/utilities/IDescriptorSetCache.h b/include/nbl/video/utilities/IDescriptorSetCache.h index c2e04906f9..c0ce5a0697 100644 --- a/include/nbl/video/utilities/IDescriptorSetCache.h +++ b/include/nbl/video/utilities/IDescriptorSetCache.h @@ -14,13 +14,33 @@ namespace nbl::video { -#if 0 // TODO: port class IDescriptorSetCache : public core::IReferenceCounted { public: using DescSetAllocator = core::PoolAddressAllocatorST; - IDescriptorSetCache(ILogicalDevice* device, core::smart_refctd_ptr&& _descPool, core::smart_refctd_ptr&& _canonicalLayout); + // + static inline core::smart_refctd_ptr create( + const uint32_t capacity, const IDescriptorPool::E_CREATE_FLAGS flags, + core::smart_refctd_ptr&& canonicalLayout + ) + { + if (capacity==0 || !canonicalLayout) + return nullptr; + void* reserved = malloc(DescSetAllocator::reserved_size(1u,capacity,1u)); + if (!reserved) + return nullptr; + auto* cache = new core::smart_refctd_ptr[capacity]; + if (!cache) + return nullptr; + auto device = const_cast(canonicalLayout->getOriginDevice()); + if (!device) + return nullptr; + auto pool = device->createDescriptorPoolForDSLayouts(flags,{&canonicalLayout.get(),1},&capacity); + if (!pool) + return nullptr; + return core::smart_refctd_ptr(new IDescriptorSetCache(std::move(pool),std::move(canonicalLayout),cache,reserved),core::dont_grab); + } // inline uint32_t getCapacity() const {return m_descPool->getCapacity();} @@ -40,26 +60,20 @@ class IDescriptorSetCache : public core::IReferenceCounted // inline uint32_t acquireSet() { - m_deferredReclaims.pollForReadyEvents(DeferredDescriptorSetReclaimer::single_poll); + m_deferredReclaims.poll(DeferredDescriptorSetReclaimer::single_poll); return m_setAllocator.alloc_addr(1u,1u); } - // needs to be called before you reset any 
fences which latch the deferred release - inline void poll_all() - { - m_deferredReclaims.pollForReadyEvents(DeferredDescriptorSetReclaimer::exhaustive_poll); - } - // - inline void releaseSet(ILogicalDevice* device, core::smart_refctd_ptr&& fence, const uint32_t setIx) + inline void releaseSet(const ISemaphore::SWaitInfo& futureWait, const uint32_t setIx) { if (setIx==invalid_index) return; - m_deferredReclaims.addEvent(GPUEventWrapper(device,std::move(fence)),DeferredDescriptorSetReclaimer(this,setIx)); + m_deferredReclaims.latch(futureWait,DeferredDescriptorSetReclaimer(this,setIx)); } - // only public because GPUDeferredEventHandlerST needs to know about it + // only public because MultiTimelineEventHandlerST needs to know about it class DeferredDescriptorSetReclaimer { IDescriptorSetCache* m_cache; @@ -70,7 +84,7 @@ class IDescriptorSetCache : public core::IReferenceCounted { } DeferredDescriptorSetReclaimer(const DeferredDescriptorSetReclaimer& other) = delete; - DeferredDescriptorSetReclaimer(DeferredDescriptorSetReclaimer&& other) : m_cache(nullptr), m_setIx(DescSetAllocator::invalid_address) + inline DeferredDescriptorSetReclaimer(DeferredDescriptorSetReclaimer&& other) : m_cache(nullptr), m_setIx(DescSetAllocator::invalid_address) { this->operator=(std::forward(other)); } @@ -116,10 +130,19 @@ class IDescriptorSetCache : public core::IReferenceCounted protected: friend class DeferredDescriptorSetReclaimer; - IDescriptorSetCache(ILogicalDevice* device, const uint32_t capacity); - virtual ~IDescriptorSetCache() + inline IDescriptorSetCache( + core::smart_refctd_ptr&& pool, + core::smart_refctd_ptr&& canonicalLayout, + core::smart_refctd_ptr* cache, + void* const reserved + ) : m_descPool(std::move(pool)), m_canonicalLayout(std::move(canonicalLayout)), m_cache(cache), + m_reserved(reserved), m_setAllocator(m_reserved,0u,0u,1u,m_descPool->getCapacity(),1u), + m_deferredReclaims(core::smart_refctd_ptr(const_cast(m_descPool->getOriginDevice()))) + {} + virtual 
inline ~IDescriptorSetCache() { - m_deferredReclaims.cullEvents(0u); + // normally the dtor would do this, but we need all the events to run before we delete the storage they reference + while (m_deferredReclaims.wait(std::chrono::steady_clock::now()+std::chrono::microseconds(100))) {} free(m_reserved); delete[] m_cache; } @@ -129,9 +152,8 @@ class IDescriptorSetCache : public core::IReferenceCounted core::smart_refctd_ptr* m_cache; void* m_reserved; DescSetAllocator m_setAllocator; - GPUDeferredEventHandlerST m_deferredReclaims; + MultiTimelineEventHandlerST m_deferredReclaims; }; -#endif } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 8297a09692..84c6b6c7ae 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -260,7 +260,6 @@ set(NBL_VIDEO_SOURCES ${NBL_ROOT_PATH}/src/nbl/video/alloc/CSimpleBufferAllocator.cpp # Utilities - ${NBL_ROOT_PATH}/src/nbl/video/utilities/IDescriptorSetCache.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/ICommandPoolCache.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/IPropertyPool.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/IUtilities.cpp diff --git a/src/nbl/video/utilities/ICommandPoolCache.cpp b/src/nbl/video/utilities/ICommandPoolCache.cpp index 4c38fb5dec..915a23c068 100644 --- a/src/nbl/video/utilities/ICommandPoolCache.cpp +++ b/src/nbl/video/utilities/ICommandPoolCache.cpp @@ -6,26 +6,16 @@ using namespace nbl; using namespace video; -ICommandPoolCache::ICommandPoolCache(ILogicalDevice* const device, const uint32_t queueFamilyIx, const core::bitflag _flags, const uint32_t capacity) - : m_reserved(malloc(CommandPoolAllocator::reserved_size(1u,capacity,1u))), m_cmdPoolAllocator(m_reserved,0u,0u,1u,capacity,1u)//, m_deferredResets() -{ - m_cache = new core::smart_refctd_ptr[capacity]; - for (auto i=0u; icreateCommandPool(queueFamilyIx,_flags); -} - void ICommandPoolCache::releaseSet(const uint32_t poolIx) { m_cache[poolIx]->reset(); m_cmdPoolAllocator.free_addr(poolIx,1); } -#if 0 void 
ICommandPoolCache::DeferredCommandPoolResetter::operator()() { #ifdef _NBL_DEBUG assert(m_cache && m_poolIxgetCapacity()); #endif // _NBL_DEBUG m_cache->releaseSet(m_poolIx); -} -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/src/nbl/video/utilities/IDescriptorSetCache.cpp b/src/nbl/video/utilities/IDescriptorSetCache.cpp deleted file mode 100644 index d2025bd3f2..0000000000 --- a/src/nbl/video/utilities/IDescriptorSetCache.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include "nbl/video/IPhysicalDevice.h" -#include "nbl/video/ILogicalDevice.h" -#include "nbl/video/utilities/IDescriptorSetCache.h" - -using namespace nbl; -using namespace video; - - -#if 0 // TODO: port -IDescriptorSetCache::IDescriptorSetCache(ILogicalDevice* device, const uint32_t capacity) - : m_descPool(), m_canonicalLayout(), m_reserved(malloc(DescSetAllocator::reserved_size(1u,capacity,1u))), - m_setAllocator(m_reserved,0u,0u,1u,capacity,1u), m_deferredReclaims() -{ - m_cache = new core::smart_refctd_ptr[capacity]; - std::fill_n(m_cache,capacity,nullptr); -} - -IDescriptorSetCache::IDescriptorSetCache(ILogicalDevice* device, core::smart_refctd_ptr&& _descPool, core::smart_refctd_ptr&& _canonicalLayout) : IDescriptorSetCache(device,_descPool->getCapacity()) -{ - m_descPool = std::move(_descPool); - m_canonicalLayout = std::move(_canonicalLayout); - for (auto i=0u; icreateDescriptorSet(core::smart_refctd_ptr(m_canonicalLayout)); -} -#endif \ No newline at end of file From 0eb8e9a096ac2202d28e1b476235b64d9ee01038 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 8 Jan 2024 23:29:58 +0100 Subject: [PATCH 24/62] turns out its quite easy to port the other utilities to the new MultiTimelineEventHandlerST --- include/nbl/video/ISemaphore.h | 4 +-- include/nbl/video/TimelineEventHandlers.h | 6 ++++ .../alloc/CAsyncSingleBufferSubAllocator.h | 28 +++++++++---------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/include/nbl/video/ISemaphore.h 
b/include/nbl/video/ISemaphore.h index f16fa86baf..0b14590e83 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -30,8 +30,8 @@ class ISemaphore : public IBackendObject // but don't want to pollute ILogicalDevice with lots of enums and structs struct SWaitInfo { - const ISemaphore* semaphore; - uint64_t value; + const ISemaphore* semaphore = nullptr; + uint64_t value = 0; }; enum class WAIT_RESULT : uint8_t { diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 33832c0605..902f82c8aa 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -55,6 +55,12 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable return poll_impl(std::forward(args)...); } + template + static inline Clock::time_point default_wait() + { + return Clock::now()+std::chrono::microseconds(50); + } + template inline uint32_t wait(const std::chrono::time_point& timeout_time, Args&&... args) { diff --git a/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h b/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h index c4d80cc7dc..f7b95464a7 100644 --- a/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h +++ b/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h @@ -1,19 +1,20 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _NBL_VIDEO_C_ASYNC_SINGLE_BUFFER_SUB_ALLOCATOR_H_ #define _NBL_VIDEO_C_ASYNC_SINGLE_BUFFER_SUB_ALLOCATOR_H_ + #include "nbl/core/alloc/GeneralpurposeAddressAllocator.h" +#include "nbl/video/alloc/CSingleBufferSubAllocator.h" +#include "nbl/video/TimelineEventHandlers.h" #include -#include "nbl/video/alloc/CSingleBufferSubAllocator.h" namespace nbl::video { -#if 0 // TODO: port namespace impl { // HostAllocator allocates both reserved space and the space needed for variable length records on the DeferredFreeFunctor @@ -134,7 +135,7 @@ class CAsyncSingleBufferSubAllocator std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); assert(tLock.owns_lock()); #endif // _NBL_DEBUG - return deferredFrees.cullEvents(0u); + return deferredFrees.poll(); } //! Returns max possible currently allocatable single allocation size, without having to wait for GPU more @@ -146,7 +147,7 @@ class CAsyncSingleBufferSubAllocator #endif // _NBL_DEBUG size_type valueToStopAt = getAddressAllocator().min_size()*3u; // padding, allocation, more padding = 3u // we don't actually want or need to poll all possible blocks to free, only first few - deferredFrees.pollForReadyEvents(valueToStopAt); + deferredFrees.poll(valueToStopAt); return getAddressAllocator().max_size(); } @@ -155,7 +156,7 @@ class CAsyncSingleBufferSubAllocator template inline size_type multi_allocate(uint32_t count, Args&&... args) noexcept { - return multi_alloc(GPUEventWrapper::default_wait(),count,std::forward(args)...); + return multi_alloc(decltype(deferredFrees)::default_wait(),count,std::forward(args)...); } //! 
attempt to allocate, if fail (presumably because of fragmentation), then keep trying till timeout is reached template @@ -174,7 +175,7 @@ class CAsyncSingleBufferSubAllocator // then try to wait at least once and allocate do { - deferredFrees.waitUntilForReadyEvents(maxWaitPoint,unallocatedSize); + deferredFrees.wait(maxWaitPoint,unallocatedSize); unallocatedSize = try_multi_alloc(args...); if (!unallocatedSize) @@ -185,13 +186,13 @@ class CAsyncSingleBufferSubAllocator } //! - inline void multi_deallocate(core::smart_refctd_ptr&& fence, DeferredFreeFunctor&& functor) noexcept + inline void multi_deallocate(const ISemaphore::SWaitInfo& futureWait, DeferredFreeFunctor&& functor) noexcept { #ifdef _NBL_DEBUG std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); assert(tLock.owns_lock()); #endif // _NBL_DEBUG - deferredFrees.addEvent(GPUEventWrapper(const_cast(m_composed.getBuffer()->getOriginDevice()),std::move(fence)),std::move(functor)); + deferredFrees.latch(futureWait,std::move(functor)); } inline void multi_deallocate(uint32_t count, const value_type* addr, const size_type* bytes) noexcept { @@ -203,17 +204,17 @@ class CAsyncSingleBufferSubAllocator } // TODO: improve signature of this function in the future template - inline void multi_deallocate(uint32_t count, const value_type* addr, const size_type* bytes, core::smart_refctd_ptr&& fence, const T*const *const objectsToDrop=nullptr) noexcept + inline void multi_deallocate(uint32_t count, const value_type* addr, const size_type* bytes, const ISemaphore::SWaitInfo& futureWait, const T*const *const objectsToDrop=nullptr) noexcept { - if (fence) - multi_deallocate(std::move(fence),DeferredFreeFunctor(&m_composed,count,addr,bytes,objectsToDrop)); + if (futureWait.semaphore) + multi_deallocate(futureWait,DeferredFreeFunctor(&m_composed,count,addr,bytes,objectsToDrop)); else multi_deallocate(count,addr,bytes); } protected: Composed m_composed; - GPUDeferredEventHandlerST deferredFrees; + 
MultiTimelineEventHandlerST deferredFrees; template inline value_type try_multi_alloc(uint32_t count, value_type* outAddresses, const size_type* byteSizes, const Args&... args) noexcept @@ -246,7 +247,6 @@ class CAsyncSingleBufferSubAllocatorST final : public core::IReferenceCounted, p template CAsyncSingleBufferSubAllocatorST(Args&&... args) : Base(std::forward(args)...) {} }; -#endif //MT version? From e59408dcc3787839af326c9f9495f562892e9ba0 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 00:17:31 +0100 Subject: [PATCH 25/62] remove more unused stuff --- include/IVideoCapabilityReporter.h | 67 ---------------- include/IVideoDriver.h | 34 -------- .../nbl/video/alloc/CSimpleBufferAllocator.h | 40 ---------- .../video/alloc/CStreamingBufferAllocator.h | 67 ---------------- .../nbl/video/alloc/GPUMemoryAllocatorBase.h | 27 ------- .../alloc/HostDeviceMirrorBufferAllocator.h | 69 ---------------- .../video/alloc/StreamingGPUBufferAllocator.h | 78 ------------------- include/nbl/video/declarations.h | 8 +- src/nbl/CMakeLists.txt | 3 - .../video/alloc/CSimpleBufferAllocator.cpp | 23 ------ 10 files changed, 3 insertions(+), 413 deletions(-) delete mode 100644 include/IVideoCapabilityReporter.h delete mode 100644 include/IVideoDriver.h delete mode 100644 include/nbl/video/alloc/CSimpleBufferAllocator.h delete mode 100644 include/nbl/video/alloc/CStreamingBufferAllocator.h delete mode 100644 include/nbl/video/alloc/GPUMemoryAllocatorBase.h delete mode 100644 include/nbl/video/alloc/HostDeviceMirrorBufferAllocator.h delete mode 100644 include/nbl/video/alloc/StreamingGPUBufferAllocator.h delete mode 100644 src/nbl/video/alloc/CSimpleBufferAllocator.cpp diff --git a/include/IVideoCapabilityReporter.h b/include/IVideoCapabilityReporter.h deleted file mode 100644 index 39fc6a5f54..0000000000 --- a/include/IVideoCapabilityReporter.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. 
-// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_I_VIDEO_CAPABILITY_REPORTER_H_INCLUDED__ -#define __NBL_I_VIDEO_CAPABILITY_REPORTER_H_INCLUDED__ - - - -namespace nbl -{ -namespace video -{ - //! . - class NBL_FORCE_EBO IVideoCapabilityReporter - { - public: - //! Get type of video driver - /** \return Type of driver. */ - //virtual E_DRIVER_TYPE getDriverType() const =0; - - //! enumeration for querying features of the video driver. - enum E_DRIVER_FEATURE - { - //! Supports Alpha To Coverage (always in OpenGL 4.3+, Vulkan Mobile GPUs don't) - EDF_ALPHA_TO_COVERAGE = 0, - - //! Supports geometry shaders (always in OpenGL 4.3+, Vulkan Mobile GPUs don't) - EDF_GEOMETRY_SHADER, - - //! Supports tessellation shaders (always in OpenGL 4.3+, Vulkan Mobile GPUs don't) - EDF_TESSELLATION_SHADER, - - //! If we can attach a stencil only texture to an FBO, if not must use Depth+Stencil - EDF_STENCIL_ONLY_TEXTURE, - - //! Whether we can get gl_DrawIDARB in GLSL (if not see https://www.g-truc.net/post-0518.html for ways to circumvent) - EDF_SHADER_DRAW_PARAMS, - - //! Whether we can force overlapping pixels to not rasterize in parallel, INTEL_fragment_shader_ordering, NV_fragment_shader_interlock or ARB_fragment_shader_interlock - EDF_FRAGMENT_SHADER_INTERLOCK, - - //! Whether textures can be used by their hardware handles bindlessly (without specifying them in descriptor sets) TODO: What to do about this? - EDF_BINDLESS_TEXTURE, - - //! Whether we can index samplers dynamically in a shader TODO: only in Vulkan or NV_gpu_shader5 - EDF_DYNAMIC_SAMPLER_INDEXING, - - //! A way to pass information between fragment shader invocations covering the same pixel - EDF_INPUT_ATTACHMENTS, - - //other feature ideas are; bindless buffers, sparse texture, sparse texture 2 - - //! 
Only used for counting the elements of this enum - EDF_COUNT - }; - - virtual uint16_t retrieveDisplayRefreshRate() const { return 0u; } - virtual uint32_t getMaxTextureBindingsCompute() const { return 0u; } - }; - -} // end namespace video -} // end namespace nbl - - -#endif - diff --git a/include/IVideoDriver.h b/include/IVideoDriver.h deleted file mode 100644 index 5a4a5d56ed..0000000000 --- a/include/IVideoDriver.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine" -// For conditions of distribution and use, see copyright notice in nabla.h -// See the original file in irrlicht source for authors - -#ifndef __NBL_I_VIDEO_DRIVER_H_INCLUDED__ -#define __NBL_I_VIDEO_DRIVER_H_INCLUDED__ - - -namespace nbl -{ -namespace video -{ -#if 0 - //! Legacy and deprecated system - class IVideoDriver : public IDriver - { - public: - //! - virtual void issueGPUTextureBarrier() =0; - - //! Event handler for resize events. Only used by the engine internally. - /** Used to notify the driver that the window was resized. - Usually, there is no need to call this method. */ - virtual void OnResize(const core::dimension2d& size) =0; - - }; -#endif - -} // end namespace video -} // end namespace nbl - - -#endif diff --git a/include/nbl/video/alloc/CSimpleBufferAllocator.h b/include/nbl/video/alloc/CSimpleBufferAllocator.h deleted file mode 100644 index 97cbe41b86..0000000000 --- a/include/nbl/video/alloc/CSimpleBufferAllocator.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2018-2022 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_C_SIMPLE_BUFFER_ALLOCATOR_H_ -#define _NBL_VIDEO_C_SIMPLE_BUFFER_ALLOCATOR_H_ - -#include "nbl/video/IDeviceMemoryAllocator.h" -#include "nbl/video/alloc/IBufferAllocator.h" - -namespace nbl::video -{ - -class CSimpleBufferAllocator : public IBufferAllocator -{ - core::smart_refctd_ptr m_device; - uint32_t m_memoryTypesToUse; - - public: - using value_type = asset::SBufferBinding; - - CSimpleBufferAllocator(core::smart_refctd_ptr&& _device, const uint32_t _memoryTypesToUse) : m_device(std::move(_device)), m_memoryTypesToUse(_memoryTypesToUse) {} - virtual ~CSimpleBufferAllocator() = default; - - inline ILogicalDevice* getDevice() {return m_device.get();} - - value_type allocate( - IGPUBuffer::SCreationParams&& creationParams, - const core::bitflag allocateFlags=IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE - ); - - inline void deallocate(value_type& allocation) - { - allocation = {IDeviceMemoryAllocator::SAllocation::InvalidMemoryOffset,nullptr}; - } -}; - -} - -#endif - diff --git a/include/nbl/video/alloc/CStreamingBufferAllocator.h b/include/nbl/video/alloc/CStreamingBufferAllocator.h deleted file mode 100644 index 2811a96f8f..0000000000 --- a/include/nbl/video/alloc/CStreamingBufferAllocator.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (C) 2018-2022 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_STREAMING_GPUBUFFER_ALLOCATOR_H_ -#define _NBL_VIDEO_STREAMING_GPUBUFFER_ALLOCATOR_H_ - -#include "nbl/video/alloc/CSimpleBufferAllocator.h" - -namespace nbl::video -{ - -class CStreamingBufferAllocator : protected CSimpleBufferAllocator -{ - public: - struct value_type - { - typename CSimpleBufferAllocator::value_type bufferBinding; - uint8_t* ptr; - }; - - using CSimpleBufferAllocator::CSimpleBufferAllocator; - virtual ~CStreamingBufferAllocator() = default; - - inline value_type allocate(IGPUBuffer::SCreationParams&& creationParams, const core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE) - { - auto bufferBinding = CSimpleBufferAllocator::allocate(std::move(creationParams),allocateFlags); - uint8_t* mappedPtr = nullptr; - if (bufferBinding.buffer) - { - IDeviceMemoryAllocation* const mem = bufferBinding.buffer->getBoundMemory().memory; - if (mem->isCurrentlyMapped()) - { - assert(mem->getMappedRange().offset == 0ull && mem->getMappedRange().length == mem->getAllocationSize()); - mappedPtr = reinterpret_cast(mem->getMappedPointer()); - } - else - { - core::bitflag access(IDeviceMemoryAllocation::EMCAF_NO_MAPPING_ACCESS); - const auto memProps = mem->getMemoryPropertyFlags(); - if (memProps.hasFlags(IDeviceMemoryAllocation::EMPF_HOST_READABLE_BIT)) - access |= IDeviceMemoryAllocation::EMCAF_READ; - if (memProps.hasFlags(IDeviceMemoryAllocation::EMPF_HOST_WRITABLE_BIT)) - access |= IDeviceMemoryAllocation::EMCAF_WRITE; - assert(access.value); - IDeviceMemoryAllocation::MemoryRange memoryRange = {0ull,mem->getAllocationSize()}; - mappedPtr = reinterpret_cast(mem->map(memoryRange,access)); - } - if (!mappedPtr) - CSimpleBufferAllocator::deallocate(bufferBinding); - mappedPtr += bufferBinding.buffer->getBoundMemory().offset+bufferBinding.offset; - } - return {std::move(bufferBinding),mappedPtr}; - } - - inline void deallocate(value_type& allocation) - { - 
allocation.ptr = nullptr; - auto* mem = allocation.bufferBinding.buffer->getBoundMemory().memory; - if (mem->getReferenceCount()==1) - mem->unmap(); - CSimpleBufferAllocator::deallocate(allocation.bufferBinding); - } -}; - -} - -#endif diff --git a/include/nbl/video/alloc/GPUMemoryAllocatorBase.h b/include/nbl/video/alloc/GPUMemoryAllocatorBase.h deleted file mode 100644 index d75411763a..0000000000 --- a/include/nbl/video/alloc/GPUMemoryAllocatorBase.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_GPU_MEMORY_ALLOCATOR_BASE_H__ -#define __NBL_VIDEO_GPU_MEMORY_ALLOCATOR_BASE_H__ - -namespace nbl::video -{ - -class ILogicalDevice; - -class GPUMemoryAllocatorBase -{ - protected: - ILogicalDevice* mDriver; // TODO: change to smartpointer backlink (after declarations_and_definitions branch merge) - - GPUMemoryAllocatorBase(ILogicalDevice* inDriver) : mDriver(inDriver) {} - virtual ~GPUMemoryAllocatorBase() = default; - public: - ILogicalDevice* getDriver() noexcept {return mDriver;} -}; - -} - - -#endif diff --git a/include/nbl/video/alloc/HostDeviceMirrorBufferAllocator.h b/include/nbl/video/alloc/HostDeviceMirrorBufferAllocator.h deleted file mode 100644 index 433007b7c9..0000000000 --- a/include/nbl/video/alloc/HostDeviceMirrorBufferAllocator.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_HOST_DEVICE_MIRROR_BUFFER_ALLOCATOR_H__ -#define __NBL_VIDEO_HOST_DEVICE_MIRROR_BUFFER_ALLOCATOR_H__ - - -#include "nbl/video/alloc/SimpleGPUBufferAllocator.h" - -namespace nbl::video -{ - -//class ILogicalDevice; - -template > -class HostDeviceMirrorBufferAllocator : protected SimpleGPUBufferAllocator -{ - HostAllocator hostAllocator; - public: - struct value_type - { - typename SimpleGPUBufferAllocator::value_type buffer; - uint8_t* ptr; // maybe a ICPUBuffer in the future? - }; - - HostDeviceMirrorBufferAllocator(ILogicalDevice* inDriver); - virtual ~HostDeviceMirrorBufferAllocator() = default; - - inline value_type allocate(size_t bytes, size_t alignment) noexcept - { - auto buff = SimpleGPUBufferAllocator::allocate(bytes,alignment); - if (!buff) - return {nullptr,nullptr}; - auto hostPtr = hostAllocator.allocate(bytes,alignment); - if (!hostPtr) - { - SimpleGPUBufferAllocator::deallocate(buff); - return {nullptr,nullptr}; - } - return {std::move(buff),hostPtr}; - } - - inline void deallocate(value_type& allocation) noexcept - { - hostAllocator.deallocate(allocation.ptr,allocation.buffer->getSize()); - SimpleGPUBufferAllocator::deallocate(allocation.buffer); - allocation.ptr = nullptr; - } -#if 0 - //to expose base functions again - IDriver* getDriver() noexcept {return SimpleGPUBufferAllocator::getDriver();} -#endif -}; - - -} - -#include "nbl/video/ILogicalDevice.h" - -namespace nbl::video -{ - -template -HostDeviceMirrorBufferAllocator::HostDeviceMirrorBufferAllocator(ILogicalDevice* inDriver) : SimpleGPUBufferAllocator(inDriver,inDriver->getDeviceLocalGPUMemoryReqs()) {} - -} - -#endif diff --git a/include/nbl/video/alloc/StreamingGPUBufferAllocator.h b/include/nbl/video/alloc/StreamingGPUBufferAllocator.h deleted file mode 100644 index 0c01f6703e..0000000000 --- a/include/nbl/video/alloc/StreamingGPUBufferAllocator.h +++ /dev/null @@ -1,78 +0,0 @@ -// 
Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_STREAMING_GPUBUFFER_ALLOCATOR_H__ -#define __NBL_VIDEO_STREAMING_GPUBUFFER_ALLOCATOR_H__ - -#include "nbl/video/alloc/SimpleGPUBufferAllocator.h" - -namespace nbl::video -{ - -//class ILogicalDevice; - -class StreamingGPUBufferAllocator : protected SimpleGPUBufferAllocator -{ - private: - void* mapWrapper(IDriverMemoryAllocation* mem, IDriverMemoryAllocation::E_MAPPING_CPU_ACCESS_FLAG access, const IDriverMemoryAllocation::MemoryRange& range) noexcept; - void unmapWrapper(IDriverMemoryAllocation* mem) noexcept; - - public: - struct value_type - { - typename SimpleGPUBufferAllocator::value_type buffer; - uint8_t* ptr; - }; - - StreamingGPUBufferAllocator(ILogicalDevice* inDriver, const IDriverMemoryBacked::SDriverMemoryRequirements& bufferReqs) : SimpleGPUBufferAllocator(inDriver,bufferReqs) - { - assert(mBufferMemReqs.mappingCapability&IDriverMemoryAllocation::EMCAF_READ_AND_WRITE); // have to have mapping access to the buffer! 
- } - virtual ~StreamingGPUBufferAllocator() = default; - - inline value_type allocate(size_t bytes, size_t alignment) noexcept - { - auto buff = SimpleGPUBufferAllocator::allocate(bytes,alignment); - if (!buff) - return {nullptr,nullptr}; - auto* const mem = buff->getBoundMemory(); - uint8_t* mappedPtr; - if (mem->isCurrentlyMapped()) - { - assert(mem->getMappedRange().offset==0ull && mem->getMappedRange().length==mem->getAllocationSize()); // whole range must be mapped always - mappedPtr = reinterpret_cast(mem->getMappedPointer()); - } - else - { - const auto mappingCaps = mem->getMappingCaps()&IDriverMemoryAllocation::EMCAF_READ_AND_WRITE; - const auto rangeToMap = IDriverMemoryAllocation::MemoryRange{0u,mem->getAllocationSize()}; - mappedPtr = reinterpret_cast(mapWrapper(mem,static_cast(mappingCaps),rangeToMap)); - } - if (!mappedPtr) - { - SimpleGPUBufferAllocator::deallocate(buff); - return {nullptr,nullptr}; - } - mappedPtr += buff->getBoundMemoryOffset(); - return {std::move(buff),mappedPtr}; - } - - inline void deallocate(value_type& allocation) noexcept - { - allocation.ptr = nullptr; - auto* mem = allocation.buffer->getBoundMemory(); - if (mem->getReferenceCount()==1) - unmapWrapper(mem); - SimpleGPUBufferAllocator::deallocate(allocation.buffer); - } -#if 0 - //to expose base functions again - ILogicalDevice* getDriver() noexcept {return SimpleGPUBufferAllocator::getDriver();} -#endif -}; - - -} - -#endif diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index 38e6d2cca6..a300dc1c2d 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -1,9 +1,8 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_DECLARATIONS_H_INCLUDED__ -#define __NBL_VIDEO_DECLARATIONS_H_INCLUDED__ +#ifndef _NBL_VIDEO_DECLARATIONS_H_INCLUDED_ +#define _NBL_VIDEO_DECLARATIONS_H_INCLUDED_ // dependencies @@ -16,7 +15,6 @@ //#include "nbl/video/asset_traits.h" // alloc -#include "nbl/video/alloc/CStreamingBufferAllocator.h" #include "nbl/video/alloc/StreamingTransientDataBuffer.h" // platform and API specific stuff diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 84c6b6c7ae..517485d08c 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -256,9 +256,6 @@ set(NBL_ASSET_SOURCES ${NBL_ROOT_PATH}/src/nbl/asset/material_compiler/CMaterialCompilerGLSLRasterBackend.cpp ) set(NBL_VIDEO_SOURCES -# Allocators - ${NBL_ROOT_PATH}/src/nbl/video/alloc/CSimpleBufferAllocator.cpp - # Utilities ${NBL_ROOT_PATH}/src/nbl/video/utilities/ICommandPoolCache.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/IPropertyPool.cpp diff --git a/src/nbl/video/alloc/CSimpleBufferAllocator.cpp b/src/nbl/video/alloc/CSimpleBufferAllocator.cpp deleted file mode 100644 index 54666f9b64..0000000000 --- a/src/nbl/video/alloc/CSimpleBufferAllocator.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#include "nbl/video/IPhysicalDevice.h" -#include "nbl/video/ILogicalDevice.h" -#include "nbl/video/alloc/CSimpleBufferAllocator.h" - -using namespace nbl; -using namespace video; - -CSimpleBufferAllocator::value_type CSimpleBufferAllocator::allocate( - IGPUBuffer::SCreationParams&& creationParams, - const core::bitflag allocateFlags) -{ - auto buffer = m_device->createBuffer(std::move(creationParams)); - auto reqs = buffer->getMemoryReqs(); - reqs.memoryTypeBits &= m_memoryTypesToUse; - auto mem = m_device->allocate(reqs,buffer.get(),allocateFlags); - if (!mem.memory) - return {0xdeadbeefull,nullptr}; - return {0ull,std::move(buffer)}; -} \ No newline at end of file From 3f41a81b8515ae5ee79d2e271776886732c7061f Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 00:50:17 +0100 Subject: [PATCH 26/62] fix one liner huge bug --- include/nbl/builtin/hlsl/cpp_compat/intrinsics.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h index 1c75abe891..c606bb4d58 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h +++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h @@ -53,7 +53,7 @@ inline T determinant(const matrix& m) NBL_BIT_OP_GLM_PASSTHROUGH(findLSB,findLSB) -NBL_BIT_OP_GLM_PASSTHROUGH(findMSB,findLSB) +NBL_BIT_OP_GLM_PASSTHROUGH(findMSB,findMSB) // inverse not defined cause its implemented via hidden friend template From fb1f50dc6b58564fb137dc0c78efa00e2ce46bcc Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 01:07:39 +0100 Subject: [PATCH 27/62] fix a smal bug and introduce a base class for TimelineEventHandler, also get everything to compile --- include/nbl/video/TimelineEventHandlers.h | 48 +++++++++++-------- .../alloc/CAsyncSingleBufferSubAllocator.h | 7 +-- .../alloc/StreamingTransientDataBuffer.h | 24 +++++----- 3 files changed, 45 
insertions(+), 34 deletions(-) diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 902f82c8aa..925829c68f 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -9,28 +9,48 @@ namespace nbl::video { +class TimelineEventHandlerBase : core::Unmovable, core::Uncopyable +{ + public: + struct PollResult + { + uint32_t eventsLeft = ~0u; + bool bailed = false; + }; + + // little utility + inline const ISemaphore* getSemaphore() const { return m_sema.get(); } + + // todo: rename to default_wait_point ? + template + static inline Clock::time_point default_wait() + { + return Clock::now()+std::chrono::microseconds(50); + } + + protected: + TimelineEventHandlerBase(core::smart_refctd_ptr&& sema) : m_sema(std::move(sema)) {} + + core::smart_refctd_ptr m_sema; +}; + template class MultiTimelineEventHandlerST; // Could be made MT and relatively lockless, if only had a good lock-few circular buffer impl // Not sure its worth the effort as anything using this will probably need to be lockful to be MT template -class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable +class TimelineEventHandlerST final : TimelineEventHandlerBase { public: // Theoretically could make a factory function cause passing a null semaphore is invalid, but counting on users to be relatively intelligent. 
inline TimelineEventHandlerST(core::smart_refctd_ptr&& sema, const uint64_t initialCapacity=4095/sizeof(FunctorValuePair)+1) : - m_sema(std::move(sema)), m_greatestLatch(0) - { - m_greatestSignal = m_sema->getCounterValue(); - } + TimelineEventHandlerBase(std::move(sema)), m_greatestLatch(0), m_greatestSignal(m_sema->getCounterValue()) {} // If you don't want to deadlock here, look into the `abort*` family of methods ~TimelineEventHandlerST() { while (wait(std::chrono::steady_clock::now()+std::chrono::seconds(5))) {} } - // little utility - inline const ISemaphore* getSemaphore() const {return m_sema.get();} inline uint32_t count() const {return m_cb.size();} @@ -44,23 +64,12 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable } // - struct PollResult - { - uint32_t eventsLeft = ~0u; - bool bailed = false; - }; template inline PollResult poll(Args&&... args) { return poll_impl(std::forward(args)...); } - template - static inline Clock::time_point default_wait() - { - return Clock::now()+std::chrono::microseconds(50); - } - template inline uint32_t wait(const std::chrono::time_point& timeout_time, Args&&... args) { @@ -160,7 +169,6 @@ class TimelineEventHandlerST final : core::Unmovable, core::Uncopyable }; // could be a circular buffer but whatever for now core::deque m_cb; - core::smart_refctd_ptr m_sema; uint64_t m_greatestSignal; uint64_t m_greatestLatch; @@ -410,7 +418,7 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable inline container_t::iterator eraseTimeline(typename container_t::iterator timeline) { // if not the last in scratch - if (timeline->waitInfoIxwaitInfoIx+1 - inline CAsyncSingleBufferSubAllocator(Args&&... args) : m_composed(std::forward(args)...) {} + inline CAsyncSingleBufferSubAllocator(Args&&... 
args) : m_composed(std::forward(args)...), + deferredFrees(core::smart_refctd_ptr(const_cast(m_composed.getBuffer()->getOriginDevice()))) {} virtual ~CAsyncSingleBufferSubAllocator() {} @@ -135,7 +136,7 @@ class CAsyncSingleBufferSubAllocator std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); assert(tLock.owns_lock()); #endif // _NBL_DEBUG - return deferredFrees.poll(); + return deferredFrees.poll().eventsLeft; } //! Returns max possible currently allocatable single allocation size, without having to wait for GPU more @@ -200,7 +201,7 @@ class CAsyncSingleBufferSubAllocator std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); assert(tLock.owns_lock()); #endif // _NBL_DEBUG - multi_deallocate(count,addr,bytes,nullptr); + multi_deallocate(count,addr,bytes,{}); } // TODO: improve signature of this function in the future template diff --git a/include/nbl/video/alloc/StreamingTransientDataBuffer.h b/include/nbl/video/alloc/StreamingTransientDataBuffer.h index 27c3ec7e81..fd8eee7347 100644 --- a/include/nbl/video/alloc/StreamingTransientDataBuffer.h +++ b/include/nbl/video/alloc/StreamingTransientDataBuffer.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h #ifndef _NBL_VIDEO_STREAMING_TRANSIENT_DATA_BUFFER_H_ @@ -15,7 +15,6 @@ namespace nbl::video { -#if 0 // TODO: port template, class RecursiveLockable=std::recursive_mutex> class StreamingTransientDataBufferMT; @@ -39,24 +38,28 @@ class StreamingTransientDataBuffer template inline StreamingTransientDataBuffer(asset::SBufferRange&& _bufferRange, Args&&... args) : m_composed(std::move(_bufferRange),std::forward(args)...) 
{ - assert(getBuffer()->getBoundMemory()->isMappable()); - assert(getBuffer()->getBoundMemory()->getMappedPointer()); + assert(getBuffer()->getBoundMemory().memory->isMappable()); + assert(getBuffer()->getBoundMemory().memory->getMappedPointer()); // we're suballocating from a buffer, whole buffer needs to be reachable from the mapped pointer - const auto mappedRange = getBuffer()->getBoundMemory()->getMappedRange(); - assert(mappedRange.offset<=getBuffer()->getBoundMemoryOffset()); - assert(mappedRange.offset+mappedRange.length>=getBuffer()->getBoundMemoryOffset()+getBuffer()->getSize()); + const auto mappedRange = getBuffer()->getBoundMemory().memory->getMappedRange(); + assert(mappedRange.offset<=getBuffer()->getBoundMemory().offset); + assert(mappedRange.offset+mappedRange.length>=getBuffer()->getBoundMemory().offset+getBuffer()->getSize()); } virtual ~StreamingTransientDataBuffer() {} // - inline bool needsManualFlushOrInvalidate() const {return getBuffer()->getBoundMemory()->haveToMakeVisible();} + inline bool needsManualFlushOrInvalidate() const {return getBuffer()->getBoundMemory().memory->haveToMakeVisible();} // getters inline IGPUBuffer* getBuffer() noexcept {return m_composed.getBuffer();} inline const IGPUBuffer* getBuffer() const noexcept {return m_composed.getBuffer();} // - inline void* getBufferPointer() noexcept {return getBuffer()->getBoundMemory()->getMappedPointer();} + inline void* getBufferPointer() noexcept + { + const auto bound = getBuffer()->getBoundMemory(); + return reinterpret_cast(bound.memory->getMappedPointer())+bound.offset; + } // inline uint32_t cull_frees() noexcept {return m_composed.cull_frees();} @@ -99,7 +102,7 @@ class StreamingTransientDataBuffer template inline size_type multi_place(uint32_t count, Args&&... 
args) noexcept { - return multi_place(GPUEventWrapper::default_wait(), count, std::forward(args)...); + return multi_place(TimelineEventHandlerBase::default_wait(),count,std::forward(args)...); } }; } @@ -206,7 +209,6 @@ class StreamingTransientDataBufferMT : public core::IReferenceCounted return lock; } }; -#endif } From 94ee6805d671cff23b2b42e58e3157dc64e32ee0 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 01:09:23 +0100 Subject: [PATCH 28/62] fix one more KHR function pointer bug and remove unused class --- .../nbl/video/alloc/SubAllocatedDataBuffer.h | 304 ------------------ src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- 2 files changed, 1 insertion(+), 305 deletions(-) delete mode 100644 include/nbl/video/alloc/SubAllocatedDataBuffer.h diff --git a/include/nbl/video/alloc/SubAllocatedDataBuffer.h b/include/nbl/video/alloc/SubAllocatedDataBuffer.h deleted file mode 100644 index 8b09d1bf45..0000000000 --- a/include/nbl/video/alloc/SubAllocatedDataBuffer.h +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_VIDEO_SUB_ALLOCATED_DATA_BUFFER_H__ -#define __NBL_VIDEO_SUB_ALLOCATED_DATA_BUFFER_H__ - -#include "nbl/core/declarations.h" - -#include -#include - -#include "nbl/video/alloc/SimpleGPUBufferAllocator.h" -#include "nbl/video/IGPUFence.h" - -namespace nbl::video -{ - -namespace impl -{ -template -class SubAllocatedDataBuffer : protected core::impl::FriendOfHeterogenousMemoryAddressAllocatorAdaptor -{ - public: - typedef typename HeterogenousMemoryAddressAllocator::OtherAllocatorType GPUBufferAllocator; - typedef typename HeterogenousMemoryAddressAllocator::HostAllocatorType CPUAllocator; - typedef typename HeterogenousMemoryAddressAllocator::size_type size_type; - static constexpr size_type invalid_address = HeterogenousMemoryAddressAllocator::invalid_address; - - private: - #ifdef _NBL_DEBUG - std::recursive_mutex stAccessVerfier; - #endif // _NBL_DEBUG - typedef SubAllocatedDataBuffer ThisType; - - template using buffer_type = decltype(U::buffer); - template struct has_buffer_member : std::false_type {}; - template struct has_buffer_member> > : std::is_same,core::smart_refctd_ptr> {}; - protected: - HeterogenousMemoryAddressAllocator mAllocator; - ILogicalDevice* mDevice; // TODO: smartpointer backlink - - template - inline size_type try_multi_alloc(uint32_t count, size_type* outAddresses, const size_type* bytes, const Args&... 
args) noexcept - { - mAllocator.multi_alloc_addr(count,outAddresses,bytes,args...); - - size_type unallocatedSize = 0; - for (uint32_t i=0u; i - inline DefaultDeferredFreeFunctor(ThisType* _this, size_type numAllocsToFree, const size_type* addrs, const size_type* bytes, const T*const *const objectsToHold) - : sadbRef(_this), rangeData(nullptr), numAllocs(numAllocsToFree) - { - static_assert(std::is_base_of_v); - - rangeData = reinterpret_cast(sadbRef->getFunctorAllocator().allocate(numAllocs,sizeof(void*))); - auto out = rangeData; - memcpy(out,addrs,sizeof(size_type)*numAllocs); - out += numAllocs; - memcpy(out,bytes,sizeof(size_type)*numAllocs); - out += numAllocs; - auto* const objHoldIt = reinterpret_cast*>(out); - for (size_t i=0u; i(out)[i] = nullptr; // clear it first - if (objectsToHold) - objHoldIt[i] = core::smart_refctd_ptr(objectsToHold[i]); - } - } - DefaultDeferredFreeFunctor(const DefaultDeferredFreeFunctor& other) = delete; - inline DefaultDeferredFreeFunctor(DefaultDeferredFreeFunctor&& other) : sadbRef(nullptr), rangeData(nullptr), numAllocs(0u) - { - this->operator=(std::forward(other)); - } - - inline ~DefaultDeferredFreeFunctor() - { - if (rangeData) - { - auto alloctr = sadbRef->getFunctorAllocator(); - alloctr.deallocate(reinterpret_cast::pointer>(rangeData),numAllocs); - } - } - - DefaultDeferredFreeFunctor& operator=(const DefaultDeferredFreeFunctor& other) = delete; - inline DefaultDeferredFreeFunctor& operator=(DefaultDeferredFreeFunctor&& other) - { - if (rangeData) // could swap the values instead - { - auto alloctr = sadbRef->getFunctorAllocator(); - alloctr.deallocate(reinterpret_cast::pointer>(rangeData),numAllocs); - } - sadbRef = other.sadbRef; - rangeData = other.rangeData; - numAllocs = other.numAllocs; - other.sadbRef = nullptr; - other.rangeData = nullptr; - other.numAllocs = 0u; - return *this; - } - - inline bool operator()(size_type& unallocatedSize) - { - operator()(); - for (size_type i=0u; ifreedSize) - unallocatedSize -= 
freedSize; - else - { - unallocatedSize = 0u; - return true; - } - } - return unallocatedSize==0u; - } - - inline void operator()() - { - #ifdef _NBL_DEBUG - assert(sadbRef && rangeData); - #endif // _NBL_DEBUG - HeterogenousMemoryAddressAllocator& alloctr = sadbRef->getAllocator(); - alloctr.multi_free_addr(numAllocs,rangeData,rangeData+numAllocs); - auto* const objHoldIt = reinterpret_cast*>(rangeData+numAllocs*2u); - for (size_t i=0u; i::value; - using DeferredFreeFunctor = std::conditional_t; - GPUDeferredEventHandlerST deferredFrees; - core::allocator > functorAllocator; // TODO : CMemoryPool a-la naughty do - public: - SubAllocatedDataBuffer() {} - - virtual ~SubAllocatedDataBuffer() {} - - //! - template - SubAllocatedDataBuffer(ILogicalDevice* dev, Args&&... args) : mAllocator(std::forward(args)...), mDevice(dev) - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - } - - - //! Mutable version for `DefaultDeferredFreeFunctor` and `StreamingTransientDataBuffer` ONLY! - inline HeterogenousMemoryAddressAllocator& getAllocator() noexcept { return mAllocator; } - //! - const HeterogenousMemoryAddressAllocator& getAllocator() const {return mAllocator;} - - //! - inline const IGPUBuffer* getBuffer() const noexcept - { - auto allocation = mAllocator.getCurrentBufferAllocation(); - - IGPUBuffer* retval; - if constexpr(has_buffer_member::value) - { - retval = allocation.buffer.get(); - } - else - { - retval = allocation.get(); - } - - - return retval; - } - inline IGPUBuffer* getBuffer() noexcept - { - return const_cast(static_cast(this)->getBuffer()); - } - - //! - inline uint32_t cull_frees() noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - return deferredFrees.cullEvents(0u); - } - - //! 
Returns max possible currently allocatable single allocation size, without having to wait for GPU more - inline size_type max_size() noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - size_type valueToStopAt = mAllocator.getAddressAllocator().min_size()*3u; // padding, allocation, more padding = 3u - // we don't actually want or need to poll all possible blocks to free, only first few - deferredFrees.pollForReadyEvents(valueToStopAt); - return mAllocator.getAddressAllocator().max_size(); - } - //! Returns max requestable alignment on the allocation (w.r.t. backing memory start) - inline size_type max_alignment() const noexcept {return mAllocator.getAddressAllocator().max_alignment();} - - - //! - template - inline size_type multi_alloc(uint32_t count, Args&&... args) noexcept - { - return multi_alloc(GPUEventWrapper::default_wait(),count,std::forward(args)...); - } - //! - template - inline size_type multi_alloc(const std::chrono::time_point& maxWaitPoint, const Args&... 
args) noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - - // try allocate once - size_type unallocatedSize = try_multi_alloc(args...); - if (!unallocatedSize) - return 0u; - - // then try to wait at least once and allocate - do - { - deferredFrees.waitUntilForReadyEvents(maxWaitPoint,unallocatedSize); - - unallocatedSize = try_multi_alloc(args...); - if (!unallocatedSize) - return 0u; - } while(Clock::now()&& fence, DeferredFreeFunctor&& functor) noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - deferredFrees.addEvent(GPUEventWrapper(mDevice, std::move(fence)),std::forward(functor)); - } - inline void multi_free(uint32_t count, const size_type* addr, const size_type* bytes) noexcept - { - #ifdef _NBL_DEBUG - std::unique_lock tLock(stAccessVerfier,std::try_to_lock_t()); - assert(tLock.owns_lock()); - #endif // _NBL_DEBUG - mAllocator.multi_free_addr(count,addr,bytes); - } - template - inline void multi_free(uint32_t count, const size_type* addr, const size_type* bytes, core::smart_refctd_ptr&& fence, const T*const *const objectsToDrop=nullptr) noexcept - { - if (fence) - multi_free(std::move(fence),DeferredFreeFunctor(this,count,addr,bytes,objectsToDrop)); - else - multi_free(count,addr,bytes); - } -}; -} - -// this buffer is not growable -template< typename _size_type=uint32_t, class BasicAddressAllocator=core::GeneralpurposeAddressAllocator<_size_type>, class GPUBufferAllocator=SimpleGPUBufferAllocator, class CPUAllocator=core::allocator > -class SubAllocatedDataBufferST : public core::IReferenceCounted, public impl::SubAllocatedDataBuffer > -{ - using Base = impl::SubAllocatedDataBuffer >; - protected: - ~SubAllocatedDataBufferST() = default; - public: - template - SubAllocatedDataBufferST(Args&&... args) : Base(std::forward(args)...) {} -}; - - -//MT version? 
- -} - -#endif - - - - diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 0714481ac8..844bfc54cb 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -307,7 +307,7 @@ bool CVulkanLogicalDevice::bindBufferMemory_impl(const uint32_t count, const SBi { VkBufferDeviceAddressInfoKHR info = {VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR,nullptr}; info.buffer = vulkanBuffer->getInternalObject(); - vulkanBuffer->setDeviceAddress(m_devf.vk.vkGetBufferDeviceAddressKHR(m_vkdev,&info)); + vulkanBuffer->setDeviceAddress(m_devf.vk.vkGetBufferDeviceAddress(m_vkdev,&info)); } } return true; From c761d424dc5c51754a731de29a416d93315957a1 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 9 Jan 2024 04:43:05 +0100 Subject: [PATCH 29/62] bring back bits of IUtilities needed for ex 05 --- include/nbl/video/utilities/IUtilities.h | 82 ++++++++++++------------ 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 029df30144..89d45da3f7 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -1,3 +1,6 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _NBL_VIDEO_I_UTILITIES_H_INCLUDED_ #define _NBL_VIDEO_I_UTILITIES_H_INCLUDED_ @@ -16,7 +19,6 @@ namespace nbl::video { -#if 0 // TODO: port class NBL_API2 IUtilities : public core::IReferenceCounted { protected: @@ -29,9 +31,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted nbl::system::logger_opt_smart_ptr m_logger; public: - IUtilities(core::smart_refctd_ptr&& device, nbl::system::logger_opt_smart_ptr&& logger = nullptr, const uint32_t downstreamSize = 0x4000000u, const uint32_t upstreamSize = 0x4000000u) - : m_device(std::move(device)) - , m_logger(std::move(logger)) + IUtilities(core::smart_refctd_ptr&& device, nbl::system::logger_opt_smart_ptr&& logger=nullptr, const uint32_t downstreamSize=0x4000000u, const uint32_t upstreamSize=0x4000000u) + : m_device(std::move(device)), m_logger(std::move(logger)) { auto physicalDevice = m_device->getPhysicalDevice(); const auto& limits = physicalDevice->getLimits(); @@ -39,22 +40,23 @@ class NBL_API2 IUtilities : public core::IReferenceCounted auto queueFamProps = physicalDevice->getQueueFamilyProperties(); uint32_t minImageTransferGranularityVolume = 1u; // minImageTransferGranularity.width * height * depth - for (uint32_t i = 0; i < queueFamProps.size(); i++) + for (auto& qf : queueFamProps) { - uint32_t volume = queueFamProps[i].minImageTransferGranularity.width * queueFamProps[i].minImageTransferGranularity.height * queueFamProps[i].minImageTransferGranularity.depth; - if(minImageTransferGranularityVolume < volume) + uint32_t volume = qf.minImageTransferGranularity.width*qf.minImageTransferGranularity.height*qf.minImageTransferGranularity.depth; + if(minImageTransferGranularityVolume(limits.nonCoherentAtomSize); - m_allocationAlignmentForBufferImageCopy = core::max(static_cast(limits.optimalBufferCopyOffsetAlignment), m_allocationAlignment); + m_allocationAlignment = limits.nonCoherentAtomSize; + 
m_allocationAlignmentForBufferImageCopy = core::max(limits.optimalBufferCopyOffsetAlignment,m_allocationAlignment); - const uint32_t bufferOptimalTransferAtom = limits.maxResidentInvocations*sizeof(uint32_t); + constexpr uint32_t OptimalCoalescedInvocationXferSize = sizeof(uint32_t); + const uint32_t bufferOptimalTransferAtom = limits.maxResidentInvocations * OptimalCoalescedInvocationXferSize; const uint32_t maxImageOptimalTransferAtom = limits.maxResidentInvocations * asset::TexelBlockInfo(asset::EF_R64G64B64A64_SFLOAT).getBlockByteSize() * minImageTransferGranularityVolume; - const uint32_t minImageOptimalTransferAtom = limits.maxResidentInvocations * asset::TexelBlockInfo(asset::EF_R8_UINT).getBlockByteSize();; - const uint32_t maxOptimalTransferAtom = core::max(bufferOptimalTransferAtom, maxImageOptimalTransferAtom); - const uint32_t minOptimalTransferAtom = core::min(bufferOptimalTransferAtom, minImageOptimalTransferAtom); + const uint32_t minImageOptimalTransferAtom = limits.maxResidentInvocations * asset::TexelBlockInfo(asset::EF_R8_UINT).getBlockByteSize(); + const uint32_t maxOptimalTransferAtom = core::max(bufferOptimalTransferAtom,maxImageOptimalTransferAtom); + const uint32_t minOptimalTransferAtom = core::min(bufferOptimalTransferAtom,minImageOptimalTransferAtom); // allocationAlignment <= minBlockSize <= minOptimalTransferAtom <= maxOptimalTransferAtom <= stagingBufferSize/4 assert(m_allocationAlignment <= minStreamingBufferAllocationSize); @@ -62,8 +64,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted assert(minStreamingBufferAllocationSize <= minOptimalTransferAtom); - assert(maxOptimalTransferAtom * 4u <= upstreamSize); - assert(maxOptimalTransferAtom * 4u <= downstreamSize); + assert(maxOptimalTransferAtom*OptimalCoalescedInvocationXferSize <= upstreamSize); + assert(maxOptimalTransferAtom*OptimalCoalescedInvocationXferSize <= downstreamSize); assert(minStreamingBufferAllocationSize % m_allocationAlignment == 0u); 
assert(minStreamingBufferAllocationSize % m_allocationAlignmentForBufferImageCopy == 0u); @@ -71,15 +73,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted const auto& enabledFeatures = m_device->getEnabledFeatures(); IGPUBuffer::SCreationParams streamingBufferCreationParams = {}; - auto commonUsages = core::bitflag(IGPUBuffer::EUF_STORAGE_TEXEL_BUFFER_BIT)|IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - if(enabledFeatures.bufferDeviceAddress) - commonUsages |= IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + auto commonUsages = core::bitflag(IGPUBuffer::EUF_STORAGE_TEXEL_BUFFER_BIT)|IGPUBuffer::EUF_STORAGE_BUFFER_BIT|IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; if (enabledFeatures.accelerationStructure) commonUsages |= IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - core::bitflag allocateFlags(IDeviceMemoryAllocation::EMAF_NONE); - if(enabledFeatures.bufferDeviceAddress) - allocateFlags |= IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT; + core::bitflag allocateFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); { IGPUBuffer::SCreationParams streamingBufferCreationParams = {}; @@ -102,8 +100,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted if (memProps.hasFlags(IDeviceMemoryAllocation::EMPF_HOST_WRITABLE_BIT)) access |= IDeviceMemoryAllocation::EMCAF_WRITE; assert(access.value); - IDeviceMemoryAllocation::MappedMemoryRange memoryRange = {mem.get(),0ull,mem->getAllocationSize()}; - m_device->mapMemory(memoryRange, access); + mem->map({0ull,reqs.size},access); m_defaultDownloadBuffer = core::make_smart_refctd_ptr>(asset::SBufferRange{0ull,downstreamSize,std::move(buffer)},maxStreamingBufferAllocationAlignment,minStreamingBufferAllocationSize); m_defaultDownloadBuffer->getBuffer()->setObjectDebugName(("Default Download Buffer of Utilities "+std::to_string(ptrdiff_t(this))).c_str()); @@ -130,23 +127,22 @@ class NBL_API2 IUtilities : public core::IReferenceCounted if (memProps.hasFlags(IDeviceMemoryAllocation::EMPF_HOST_WRITABLE_BIT)) access |= 
IDeviceMemoryAllocation::EMCAF_WRITE; assert(access.value); - IDeviceMemoryAllocation::MappedMemoryRange memoryRange = {mem.get(),0ull,mem->getAllocationSize()}; - m_device->mapMemory(memoryRange, access); + mem->map({0ull,reqs.size},access); m_defaultUploadBuffer = core::make_smart_refctd_ptr>(asset::SBufferRange{0ull,upstreamSize,std::move(buffer)},maxStreamingBufferAllocationAlignment,minStreamingBufferAllocationSize); m_defaultUploadBuffer->getBuffer()->setObjectDebugName(("Default Upload Buffer of Utilities "+std::to_string(ptrdiff_t(this))).c_str()); } +#if 0 // TODO: port m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); // smaller workgroups fill occupancy gaps better, especially on new Nvidia GPUs, but we don't want too small workgroups on mobile // TODO: investigate whether we need to clamp against 256u instead of 128u on mobile const auto scan_workgroup_size = core::max(core::roundDownToPoT(limits.maxWorkgroupSize[0]) >> 1u, 128u); m_scanner = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device), scan_workgroup_size); +#endif } - ~IUtilities() + inline ~IUtilities() { - m_device->unmapMemory(m_defaultDownloadBuffer->getBuffer()->getBoundMemory()); - m_device->unmapMemory(m_defaultUploadBuffer->getBuffer()->getBoundMemory()); } //! @@ -162,6 +158,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return m_defaultDownloadBuffer.get(); } +#if 0 // TODO: port //! virtual CPropertyPoolHandler* getDefaultPropertyPoolHandler() const { @@ -173,7 +170,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted { return m_scanner.get(); } - +#endif //! 
This function provides some guards against streamingBuffer fragmentation or allocation failure static uint32_t getAllocationSizeForStreamingBuffer(const size_t size, const uint64_t alignment, uint32_t maxFreeBlock, const uint32_t optimalTransferAtom) { @@ -198,6 +195,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return allocationSize; } +#if 0 // TODO: port //! WARNING: This function blocks the CPU and stalls the GPU! inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(IQueue* queue, IGPUBuffer::SCreationParams&& params, const void* data) { @@ -396,6 +394,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted // pipelineBarrierAutoSubmit? +#endif // -------------- // downloadBufferRangeViaStagingBuffer @@ -406,9 +405,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted struct default_data_consumption_callback_t { - default_data_consumption_callback_t(void* dstPtr) : - m_dstPtr(dstPtr) - {} + default_data_consumption_callback_t(void* dstPtr) : m_dstPtr(dstPtr) {} inline void operator()(const size_t dstOffset, const void* srcPtr, const size_t size) { @@ -444,8 +441,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted if (m_downstreamingBuffer->needsManualFlushOrInvalidate()) { const auto nonCoherentAtomSize = device->getPhysicalDevice()->getLimits().nonCoherentAtomSize; - auto flushRange = AlignedMappedMemoryRange(m_downstreamingBuffer->getBuffer()->getBoundMemory(), m_copyRange.offset, m_copyRange.length, nonCoherentAtomSize); - device->invalidateMappedMemoryRanges(1u, &flushRange); + auto flushRange = AlignedMappedMemoryRange(m_downstreamingBuffer->getBuffer()->getBoundMemory().memory,m_copyRange.offset,m_copyRange.length,nonCoherentAtomSize); + device->invalidateMappedMemoryRanges(1u,&flushRange); } // Call the function const uint8_t* copySrc = reinterpret_cast(m_downstreamingBuffer->getBufferPointer()) + m_copyRange.offset; @@ -459,7 +456,7 @@ class NBL_API2 IUtilities : public 
core::IReferenceCounted StreamingTransientDataBufferMT<>* m_downstreamingBuffer; const size_t m_dstOffset; }; - +#if 0 // TODO: port //! Calls the callback to copy the data to a destination Offset //! * IMPORTANT: To make the copies ready, IUtility::getDefaultDownStreamingBuffer()->cull_frees() should be called after the `submissionFence` is signaled. //! If the allocation from staging memory fails due to large image size or fragmentation then This function may need to submit the command buffer via the `submissionQueue` and then signal the fence. @@ -742,20 +739,21 @@ class NBL_API2 IUtilities : public core::IReferenceCounted asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, IGPUImage::LAYOUT currentDstImageLayout, const core::SRange& regions, IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo = {} ); +#endif - protected: - + protected: // The application must round down the start of the range to the nearest multiple of VkPhysicalDeviceLimits::nonCoherentAtomSize, // and round the end of the range up to the nearest multiple of VkPhysicalDeviceLimits::nonCoherentAtomSize. - static IDeviceMemoryAllocation::MappedMemoryRange AlignedMappedMemoryRange(IDeviceMemoryAllocation* mem, const size_t& off, const size_t& len, size_t nonCoherentAtomSize) + static ILogicalDevice::MappedMemoryRange AlignedMappedMemoryRange(IDeviceMemoryAllocation* mem, const size_t& off, const size_t& len, size_t nonCoherentAtomSize) { - IDeviceMemoryAllocation::MappedMemoryRange range = {}; + ILogicalDevice::MappedMemoryRange range = {}; range.memory = mem; range.offset = core::alignDown(off, nonCoherentAtomSize); range.length = core::min(core::alignUp(len, nonCoherentAtomSize), mem->getAllocationSize()); return range; } +#if 0 // TODO: port //! Internal tool used to patch command buffers in submit info. 
class CSubmitInfoPatcher { @@ -820,16 +818,18 @@ class NBL_API2 IUtilities : public core::IReferenceCounted core::vector m_allCommandBuffers; core::smart_refctd_ptr m_newCommandBuffer; // if necessary, then need to hold reference to. }; - +#endif core::smart_refctd_ptr m_device; core::smart_refctd_ptr > m_defaultDownloadBuffer; core::smart_refctd_ptr > m_defaultUploadBuffer; +#if 0 // TODO: port core::smart_refctd_ptr m_propertyPoolHandler; core::smart_refctd_ptr m_scanner; -}; #endif +}; + class ImageRegionIterator { public: From 04689b9924898146250254883b89a272105e8ad2 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Tue, 5 Dec 2023 23:31:52 +0300 Subject: [PATCH 30/62] device cap traits --- .../hlsl/device_capabilities_traits.hlsl | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/device_capabilities_traits.hlsl b/include/nbl/builtin/hlsl/device_capabilities_traits.hlsl index b56fc9a557..e7263fd062 100644 --- a/include/nbl/builtin/hlsl/device_capabilities_traits.hlsl +++ b/include/nbl/builtin/hlsl/device_capabilities_traits.hlsl @@ -7,15 +7,44 @@ #include #ifdef __HLSL_VERSION + +NBL_GENERATE_MEMBER_TESTER(shaderFloat64); +NBL_GENERATE_MEMBER_TESTER(shaderDrawParameters); +NBL_GENERATE_MEMBER_TESTER(subgroupArithmetic); +NBL_GENERATE_MEMBER_TESTER(fragmentShaderPixelInterlock); +NBL_GENERATE_MEMBER_TESTER(maxOptimallyResidentWorkgroupInvocations); + +#define NBL_GENERATE_GET_OR_DEFAULT(field, ty, default) \ +template::value> struct get_or_default_##field : integral_constant {}; \ +template struct get_or_default_##field : integral_constant {}; + namespace nbl { namespace hlsl { + +namespace impl +{ +NBL_GENERATE_GET_OR_DEFAULT(shaderFloat64, bool, false); +NBL_GENERATE_GET_OR_DEFAULT(shaderDrawParameters, bool, false); +NBL_GENERATE_GET_OR_DEFAULT(subgroupArithmetic, bool, false); +NBL_GENERATE_GET_OR_DEFAULT(fragmentShaderPixelInterlock, bool, false); 
+NBL_GENERATE_GET_OR_DEFAULT(maxOptimallyResidentWorkgroupInvocations, uint16_t, 0); +} + + template struct device_capabilities_traits { - // TODO: check for members and default them to sane things, only do the 5 members in CJITIncludeLoader.cpp struct, we'll do the rest on `vulkan_1_3` branch with Nahim + NBL_CONSTEXPR_STATIC_INLINE bool shaderFloat64 = impl::get_or_default_shaderFloat64::value; + NBL_CONSTEXPR_STATIC_INLINE bool shaderDrawParameters = impl::get_or_default_shaderDrawParameters::value; + NBL_CONSTEXPR_STATIC_INLINE bool subgroupArithmetic = impl::get_or_default_subgroupArithmetic::value; + NBL_CONSTEXPR_STATIC_INLINE bool fragmentShaderPixelInterlock = impl::get_or_default_fragmentShaderPixelInterlock::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t maxOptimallyResidentWorkgroupInvocations = impl::get_or_default_maxOptimallyResidentWorkgroupInvocations::value; }; + +#undef NBL_GENERATE_GET_OR_DEFAULT + } } #endif From 4a17eafbd5e52143fdfd7ac7aebc00dca55e9165 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Wed, 6 Dec 2023 01:09:00 +0300 Subject: [PATCH 31/62] port macros to boost pp --- .../nbl/builtin/hlsl/member_test_macros.hlsl | 72 ++++++------------- 1 file changed, 20 insertions(+), 52 deletions(-) diff --git a/include/nbl/builtin/hlsl/member_test_macros.hlsl b/include/nbl/builtin/hlsl/member_test_macros.hlsl index 021be424ce..f9c46cc98e 100644 --- a/include/nbl/builtin/hlsl/member_test_macros.hlsl +++ b/include/nbl/builtin/hlsl/member_test_macros.hlsl @@ -5,6 +5,7 @@ #define _NBL_BUILTIN_HLSL_MEMBER_TEST_MACROS_INCLUDED_ #include +#include #ifdef __HLSL_VERSION @@ -74,60 +75,31 @@ NBL_GENERATE_MEMBER_TESTER(z) NBL_GENERATE_MEMBER_TESTER(w) -// Even though it should work for some reason tests fail -// proof it works : https://godbolt.org/z/EzPWGnTPb +#define NBL_REPEAT(fn, n) BOOST_PP_REPEAT(n, fn, n) -#define CAT(x, y) x##y -#define TYPE_DECLARE(n) typename Arg##n -#define TYPE_DECLARE_DEFAULT(n) TYPE_DECLARE(n)=void -#define TYPE_FWD(n) Arg##n 
-#define DECLVAL_DECLARE(n) impl::declval() +#define NBL_TYPE_DECLARE(z, n, x) BOOST_PP_COMMA_IF(x) typename Arg##n +#define NBL_TYPE_DECLARE_DEFAULT(z, n, x) BOOST_PP_COMMA_IF(x) typename Arg##n=void +#define NBL_TYPE_FWD(z, n, x) BOOST_PP_COMMA_IF(x) Arg##n +#define NBL_DECLVAL_DECLARE(z, n, x) impl::declval() BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(BOOST_PP_INC(n), x)) -#define FOR_EACH0(fn) -#define FOR_EACH1(fn) fn(1) -#define FOR_EACH2(fn) fn(2), FOR_EACH1(fn) -#define FOR_EACH3(fn) fn(3), FOR_EACH2(fn) -#define FOR_EACH4(fn) fn(4), FOR_EACH3(fn) -#define FOR_EACH(fn, n) CAT(FOR_EACH, n)(fn) - -#define GENERATE_STATIC_METHOD_TESTER_SPEC0(x) \ -template \ -struct has_static_method_##x::type> : true_type \ -{ \ - using return_type = decltype(T::x()); \ - NBL_CONSTEXPR_STATIC_INLINE uint arg_count = 0; \ -}; - -#define GENERATE_STATIC_METHOD_TESTER_SPEC(x, n) \ -template \ -struct has_static_method_##x::type> : true_type \ +#define GENERATE_STATIC_METHOD_TESTER_SPEC(z, n, x) \ +template \ +struct has_static_method_##x::type> : true_type \ { \ - using return_type = decltype(T::x(FOR_EACH(DECLVAL_DECLARE, n))); \ + using return_type = decltype(T::x(NBL_REPEAT(NBL_DECLVAL_DECLARE, n))); \ NBL_CONSTEXPR_STATIC_INLINE uint arg_count = n; \ }; -#define GENERATE_STATIC_METHOD_TESTER(x) \ -template \ +#define GENERATE_STATIC_METHOD_TESTER(x, n) \ +template \ struct has_static_method_##x : false_type {}; \ -GENERATE_STATIC_METHOD_TESTER_SPEC0(x) \ -GENERATE_STATIC_METHOD_TESTER_SPEC(x, 1) \ -GENERATE_STATIC_METHOD_TESTER_SPEC(x, 2) \ -GENERATE_STATIC_METHOD_TESTER_SPEC(x, 3) \ -GENERATE_STATIC_METHOD_TESTER_SPEC(x, 4) - -#define GENERATE_METHOD_TESTER_SPEC0(x) \ -template \ -struct has_method_##x().x())>::type> : impl::if_2_else_1::value> \ -{ \ - using return_type = decltype(impl::declval().x()); \ - NBL_CONSTEXPR_STATIC_INLINE uint arg_count = 0; \ -}; +BOOST_PP_REPEAT(n, GENERATE_STATIC_METHOD_TESTER_SPEC, x) -#define GENERATE_METHOD_TESTER_SPEC(x, n) \ -template \ 
-struct has_method_##x().x(FOR_EACH(DECLVAL_DECLARE, n)))>::type> : impl::if_2_else_1::value> \ +#define GENERATE_METHOD_TESTER_SPEC(z, n, x) \ +template \ +struct has_method_##x().x(NBL_REPEAT(NBL_DECLVAL_DECLARE, n)))>::type> : impl::if_2_else_1::value> \ { \ - using return_type = decltype(impl::declval().x(FOR_EACH(DECLVAL_DECLARE, n))); \ + using return_type = decltype(impl::declval().x(NBL_REPEAT(NBL_DECLVAL_DECLARE, n))); \ NBL_CONSTEXPR_STATIC_INLINE uint arg_count = n; \ }; @@ -147,14 +119,10 @@ struct has_method_##x \ +namespace impl { GENERATE_STATIC_METHOD_TESTER(x, 4) } \ +template \ struct has_method_##x : false_type {}; \ -GENERATE_METHOD_TESTER_SPEC0(x) \ -GENERATE_METHOD_TESTER_SPEC(x, 1) \ -GENERATE_METHOD_TESTER_SPEC(x, 2) \ -GENERATE_METHOD_TESTER_SPEC(x, 3) \ -GENERATE_METHOD_TESTER_SPEC(x, 4) \ +BOOST_PP_REPEAT(4, GENERATE_METHOD_TESTER_SPEC, x) \ }} From 5fcad02dc0e29074f9eeb652e365299a3fc7fe2e Mon Sep 17 00:00:00 2001 From: atkurtul Date: Wed, 6 Dec 2023 02:10:47 +0300 Subject: [PATCH 32/62] has_member_x_with_type --- include/nbl/builtin/hlsl/member_test_macros.hlsl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/member_test_macros.hlsl b/include/nbl/builtin/hlsl/member_test_macros.hlsl index f9c46cc98e..cbceb00c9e 100644 --- a/include/nbl/builtin/hlsl/member_test_macros.hlsl +++ b/include/nbl/builtin/hlsl/member_test_macros.hlsl @@ -59,12 +59,13 @@ struct is_static_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_con template \ struct is_static_member_##a::value,void>::type>: is_const_helper {}; \ template \ -struct is_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_constant = false;}; \ +struct is_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_constant = false; using type = void; }; \ template \ struct is_member_##a().a),void>::value,void>::type> : is_const_helper().a), true>{}; \ } \ template \ struct has_member_##a { NBL_CONSTEXPR_STATIC_INLINE 
e_member_presence value = (e_member_presence)(impl::is_member_##a::value + impl::is_static_member_##a::value + impl::is_static_member_##a::is_constant); }; \ +template struct has_member_##a##_with_type : bool_constant::value && is_same::type, F>::value> {}; \ } \ } From 3c97ef16f49d821154f10199add05da58e106519 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Wed, 6 Dec 2023 02:40:57 +0300 Subject: [PATCH 33/62] make e_member_presence bitflags --- .../nbl/builtin/hlsl/member_test_macros.hlsl | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/include/nbl/builtin/hlsl/member_test_macros.hlsl b/include/nbl/builtin/hlsl/member_test_macros.hlsl index cbceb00c9e..36384f46dc 100644 --- a/include/nbl/builtin/hlsl/member_test_macros.hlsl +++ b/include/nbl/builtin/hlsl/member_test_macros.hlsl @@ -17,19 +17,11 @@ namespace hlsl namespace impl { -template -struct is_const_helper : bool_constant -{ - using type = T; - NBL_CONSTEXPR_STATIC_INLINE bool is_constant = is_const::value; -}; - enum e_member_presence { - absent = 0, - non_static = 1, - as_static = 2, - static_constexpr = 3, + is_present = 1<<0, + is_static = 1<<1, + is_const = 1<<2, }; template @@ -55,16 +47,16 @@ namespace hlsl \ { \ namespace impl { \ template \ -struct is_static_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_constant = false; }; \ +struct is_static_member_##a: false_type { }; \ template \ -struct is_static_member_##a::value,void>::type>: is_const_helper {}; \ +struct is_static_member_##a::value,void>::type> : true_type { }; \ template \ -struct is_member_##a: false_type {NBL_CONSTEXPR_STATIC_INLINE bool is_constant = false; using type = void; }; \ +struct is_member_##a: false_type { using type = void; }; \ template \ -struct is_member_##a().a),void>::value,void>::type> : is_const_helper().a), true>{}; \ +struct is_member_##a().a),void>::value,void>::type> : true_type { using type = decltype(declval().a); }; \ } \ template \ -struct has_member_##a { 
NBL_CONSTEXPR_STATIC_INLINE e_member_presence value = (e_member_presence)(impl::is_member_##a::value + impl::is_static_member_##a::value + impl::is_static_member_##a::is_constant); }; \ +struct has_member_##a { NBL_CONSTEXPR_STATIC_INLINE e_member_presence value = (e_member_presence)(impl::is_member_##a::value + 2*impl::is_static_member_##a::value + 4*is_const::type>::value); }; \ template struct has_member_##a##_with_type : bool_constant::value && is_same::type, F>::value> {}; \ } \ } From 06b43afe35054b35078795c67f372b4f6f60c36e Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 10 Jan 2024 15:43:36 +0100 Subject: [PATCH 34/62] Use new inline SPIR-V builtin syntax from DXC --- .../nbl/builtin/hlsl/glsl_compat/core.hlsl | 12 +++--- .../hlsl/glsl_compat/subgroup_ballot.hlsl | 38 +++---------------- .../hlsl/glsl_compat/subgroup_basic.hlsl | 23 +++-------- include/nbl/builtin/hlsl/macros.h | 2 +- .../builtin/hlsl/spirv_intrinsics/core.hlsl | 16 ++++++++ .../spirv_intrinsics/subgroup_ballot.hlsl | 11 ++++++ .../hlsl/spirv_intrinsics/subgroup_basic.hlsl | 9 ++++- 7 files changed, 55 insertions(+), 56 deletions(-) diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 3b485ecdd7..92691fdb24 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -60,12 +60,14 @@ T atomicCompSwap(NBL_REF_ARG(T) ptr, T comparator, T value) * For Compute Shaders */ -// TODO (Future): Its annoying we have to forward declare those, but accessing gl_NumSubgroups and other gl_* values is not yet possible due to https://github.com/microsoft/DirectXShaderCompiler/issues/4217 -// also https://github.com/microsoft/DirectXShaderCompiler/issues/5280 -uint32_t gl_LocalInvocationIndex(); +// TODO: Extemely annoying that HLSL doesn't have referencies, so we can't transparently alias the variables as `const&` :( +uint32_t3 gl_NumWorkGroups() {return spirv::NumWorkGroups;} +// TODO: DXC 
BUG prevents us from defining this! uint32_t3 gl_WorkGroupSize(); -uint32_t3 gl_GlobalInvocationID(); -uint32_t3 gl_WorkGroupID(); +uint32_t3 gl_WorkGroupID() {return spirv::WorkgroupId;} +uint32_t3 gl_LocalInvocationID() {return spirv::LocalInvocationId;} +uint32_t3 gl_GlobalInvocationID() {return spirv::GlobalInvocationId;} +uint32_t gl_LocalInvocationIndex() {return spirv::LocalInvocationIndex;} void barrier() { spirv::controlBarrier(spv::ScopeWorkgroup, spv::ScopeWorkgroup, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask); diff --git a/include/nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl b/include/nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl index 528b523d9a..ecd888ae2c 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl @@ -14,38 +14,12 @@ namespace hlsl namespace glsl { -uint32_t4 gl_SubgroupEqMask() -{ - const uint32_t comp = gl_SubgroupInvocationID()>>5; - uint32_t4 retval = uint32_t4(0,0,0,0); - retval[comp] = 0x1u<<(gl_SubgroupInvocationID()&31u); - return retval; -} - -uint32_t4 gl_SubgroupGeMask() -{ - const uint32_t FullBits = 0xffffffffu; - const uint32_t comp = gl_SubgroupInvocationID()>>5; - uint32_t4 retval = uint32_t4(comp>0 ? 0u:FullBits,comp>1 ? 0u:FullBits,comp>2 ? 
0u:FullBits,0u); - retval[comp] = FullBits<<(gl_SubgroupInvocationID()&31u); - return retval; -} - -uint32_t4 gl_SubgroupGtMask() -{ - uint32_t4 retval = gl_SubgroupGeMask(); - const uint32_t comp = gl_SubgroupInvocationID()>>5; - retval[comp] = 0xfffffffeu<<(gl_SubgroupInvocationID()&31u); - return retval; -} - -uint32_t4 gl_SubgroupLeMask() { - return ~gl_SubgroupGtMask(); -} - -uint32_t4 gl_SubgroupLtMask() { - return ~gl_SubgroupGeMask(); -} +// TODO: Extemely annoying that HLSL doesn't have referencies, so we can't transparently alias the variables as `const&` :( +uint32_t4 gl_SubgroupEqMask() {return spirv::BuiltInSubgroupEqMask;} +uint32_t4 gl_SubgroupGeMask() {return spirv::BuiltInSubgroupGeMask;} +uint32_t4 gl_SubgroupGtMask() {return spirv::BuiltInSubgroupGtMask;} +uint32_t4 gl_SubgroupLeMask() {return spirv::BuiltInSubgroupLeMask;} +uint32_t4 gl_SubgroupLtMask() {return spirv::BuiltInSubgroupLtMask;} template T subgroupBroadcastFirst(T value) diff --git a/include/nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl b/include/nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl index c7feacef6f..b7dc990aa4 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl @@ -13,25 +13,15 @@ namespace hlsl { namespace glsl { - #ifdef __HLSL_VERSION -uint32_t gl_SubgroupSize() { - return WaveGetLaneCount(); -} - -uint32_t gl_SubgroupSizeLog2() { - return firstbithigh(gl_SubgroupSize()); -} - -uint32_t gl_SubgroupInvocationID() { - return WaveGetLaneIndex(); -} +// TODO: Extemely annoying that HLSL doesn't have referencies, so we can't transparently alias the variables as `const&` :( +uint32_t gl_SubgroupSize() {return spirv::SubgroupSize;} +uint32_t gl_SubgroupSizeLog2() {return firstbithigh(spirv::SubgroupSize);} +uint32_t gl_SubgroupInvocationID() {return spirv::SubgroupLocalInvocationId;} // only available in compute -uint32_t gl_SubgroupID() { - // TODO (PentaKon): This is not always correct 
(subgroup IDs aren't always aligned with invocation index per the spec) - return gl_LocalInvocationIndex() >> gl_SubgroupSizeLog2(); -} +uint32_t gl_NumSubgroups() {return spirv::NumSubgroups;} +uint32_t gl_SubgroupID() {return spirv::SubgroupId;} bool subgroupElect() { return spirv::subgroupElect(spv::ScopeSubgroup); @@ -57,7 +47,6 @@ void subgroupMemoryBarrierImage() { spirv::memoryBarrier(spv::ScopeSubgroup, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsImageMemoryMask); } #endif - } } } diff --git a/include/nbl/builtin/hlsl/macros.h b/include/nbl/builtin/hlsl/macros.h index b48f90eb2f..c9f08738cb 100644 --- a/include/nbl/builtin/hlsl/macros.h +++ b/include/nbl/builtin/hlsl/macros.h @@ -29,7 +29,7 @@ #define NBL_ARG_125(a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20,a21,a22,a23,a24,a25,a26,a27,a28,a29,a30,a31,a32,a33,a34,a35,a36,a37,a38,a39,a40,a41,a42,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52,a53,a54,a55,a56,a57,a58,a59,a60,a61,a62,a63,a64,a65,a66,a67,a68,a69,a70,a71,a72,a73,a74,a75,a76,a77,a78,a79,a80,a81,a82,a83,a84,a85,a86,a87,a88,a89,a90,a91,a92,a93,a94,a95,a96,a97,a98,a99,a100,a101,a102,a103,a104,a105,a106,a107,a108,a109,a110,a111,a112,a113,a114,a115,a116,a117,a118,a119,a120,a121,a122,a123,a124,a125, ... ) a125 #define NBL_VA_ARGS_COUNT( ... ) NBL_EVAL(NBL_ARG_125(__VA_ARGS__,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) -// +// TODO: Use BOOST_PP! #define NBL_FOREACH_0(WHAT) #define NBL_FOREACH_1(WHAT, X) NBL_EVAL(WHAT(X)) #define NBL_FOREACH_2(WHAT, X, ...) 
NBL_EVAL(WHAT(X)NBL_FOREACH_1(WHAT, __VA_ARGS__)) diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index e202118e8b..1380355669 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -18,6 +18,22 @@ namespace hlsl #ifdef __HLSL_VERSION namespace spirv { +[[vk::ext_builtin_input(spv::BuiltInHelperInvocation)]] +static const bool HelperInvocation; + +[[vk::ext_builtin_input(spv::BuiltInNumWorkgroups)]] +static const uint32_t3 NumWorkGroups; +// TODO: Doesn't work, find out why and file issue on DXC! +//[[vk::ext_builtin_input(spv::BuiltInWorkgroupSize)]] +//static const uint32_t3 WorkgroupSize; +[[vk::ext_builtin_input(spv::BuiltInWorkgroupId)]] +static const uint32_t3 WorkgroupId; +[[vk::ext_builtin_input(spv::BuiltInLocalInvocationId)]] +static const uint32_t3 LocalInvocationId; +[[vk::ext_builtin_input(spv::BuiltInGlobalInvocationId)]] +static const uint32_t3 GlobalInvocationId; +[[vk::ext_builtin_input(spv::BuiltInLocalInvocationIndex)]] +static const uint32_t LocalInvocationIndex; template T atomicAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_ballot.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_ballot.hlsl index cd25c18af7..64c696d3f9 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_ballot.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_ballot.hlsl @@ -15,6 +15,17 @@ namespace hlsl { namespace spirv { +[[vk::ext_builtin_input(spv::BuiltInSubgroupEqMask)]] +static const uint32_t4 BuiltInSubgroupEqMask; +[[vk::ext_builtin_input(spv::BuiltInSubgroupGeMask)]] +static const uint32_t4 BuiltInSubgroupGeMask; +[[vk::ext_builtin_input(spv::BuiltInSubgroupGtMask)]] +static const uint32_t4 BuiltInSubgroupGtMask; +[[vk::ext_builtin_input(spv::BuiltInSubgroupLeMask)]] +static const 
uint32_t4 BuiltInSubgroupLeMask; +[[vk::ext_builtin_input(spv::BuiltInSubgroupLtMask)]] +static const uint32_t4 BuiltInSubgroupLtMask; + template [[vk::ext_capability( spv::CapabilityGroupNonUniformBallot )]] [[vk::ext_instruction( spv::OpGroupNonUniformBroadcastFirst )]] diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_basic.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_basic.hlsl index 0149f4737b..08d493b87a 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_basic.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_basic.hlsl @@ -14,10 +14,17 @@ namespace hlsl { namespace spirv { +[[vk::ext_builtin_input(spv::BuiltInSubgroupSize)]] +static const uint32_t SubgroupSize; +[[vk::ext_builtin_input(spv::BuiltInNumSubgroups)]] +static const uint32_t NumSubgroups; +[[vk::ext_builtin_input(spv::BuiltInSubgroupId)]] +static const uint32_t SubgroupId; +[[vk::ext_builtin_input(spv::BuiltInSubgroupLocalInvocationId)]] +static const uint32_t SubgroupLocalInvocationId; [[vk::ext_instruction( spv::OpGroupNonUniformElect )]] bool subgroupElect(uint32_t executionScope); - } } } From fd73e2802f1f667e1bf527127564de89e31e8488 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 10:53:45 +0100 Subject: [PATCH 35/62] const correctness on surface capabilities --- include/nbl/video/utilities/SPhysicalDeviceFilter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/utilities/SPhysicalDeviceFilter.h b/include/nbl/video/utilities/SPhysicalDeviceFilter.h index ff3b9b743c..ef5d15661d 100644 --- a/include/nbl/video/utilities/SPhysicalDeviceFilter.h +++ b/include/nbl/video/utilities/SPhysicalDeviceFilter.h @@ -56,11 +56,11 @@ namespace nbl::video // See vkGetPhysicalDeviceSurfaceSupportKHR struct SurfaceCompatibility { - ISurface* surface = nullptr; + const ISurface* surface = nullptr; // Setting this to `EQF_NONE` means it sufffices to find any queue family that can present to this surface, 
regardless of flags it might have core::bitflag presentationQueueFlags = IQueue::FAMILY_FLAGS::NONE; }; - SurfaceCompatibility* requiredSurfaceCompatibilities = nullptr; + const SurfaceCompatibility* requiredSurfaceCompatibilities = nullptr; uint32_t requiredSurfaceCompatibilitiesCount = 0u; From 153dd21fa03d0a612a990cc27534d81839a9019c Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 12:25:50 +0100 Subject: [PATCH 36/62] 3D Blit test case was failing because of unimplemented functions for the R11G11B10_UFLOAT format, but there are still errors --- include/nbl/asset/format/EFormat.h | 91 ++++++++++++++----------- include/nbl/asset/format/encodePixels.h | 1 + 2 files changed, 53 insertions(+), 39 deletions(-) diff --git a/include/nbl/asset/format/EFormat.h b/include/nbl/asset/format/EFormat.h index 1ca7b97dd4..bd91db1d17 100644 --- a/include/nbl/asset/format/EFormat.h +++ b/include/nbl/asset/format/EFormat.h @@ -1762,18 +1762,28 @@ inline value_type getFormatMaxValue(E_FORMAT format, uint32_t channel) { switch (format) { - case EF_BC6H_SFLOAT_BLOCK: return 32767; - case EF_BC6H_UFLOAT_BLOCK: return 65504; - default: break; + case EF_B10G11R11_UFLOAT_PACK32: + if (channel<=1) + return 65520; + else if (channel==2) + return 65504; + break; + case EF_E5B9G9R9_UFLOAT_PACK32: + if (channel<3) + return 32704; + break; + case EF_BC6H_SFLOAT_BLOCK: return 32767; + case EF_BC6H_UFLOAT_BLOCK: return 65504; + default: break; } auto bytesPerChannel = (getBytesPerPixel(format)*core::rational(1,getFormatChannelCount(format))).getIntegerApprox(); switch (bytesPerChannel) { - case 2u: return 65504; - case 4u: return FLT_MAX; - case 8u: return DBL_MAX; - default: break; + case 2u: return 65504; + case 4u: return FLT_MAX; + case 8u: return DBL_MAX; + default: break; } } return 0; @@ -1882,44 +1892,47 @@ inline value_type getFormatPrecision(E_FORMAT format, uint32_t channel, value_ty else if (isFloatingPointFormat(format)) { switch (format) - { - case 
EF_B10G11R11_UFLOAT_PACK32: { // unsigned values are always ordered as + 1 - float f = std::abs(static_cast(value)); - int bitshft = channel == 2u ? 6 : 5; - - uint16_t f16 = core::Float16Compressor::compress(f); - uint16_t enc = f16 >> bitshft; - uint16_t next_f16 = (enc + 1) << bitshft; - - return core::Float16Compressor::decompress(next_f16) - f; - } - case EF_E5B9G9R9_UFLOAT_PACK32: - return 0; //TODO - default: break; + case EF_B10G11R11_UFLOAT_PACK32: [[fallthrough]]; + case EF_E5B9G9R9_UFLOAT_PACK32: // TODO: probably need to change signature and take all values? + { + float f = std::abs(static_cast(value)); + int bitshift; + if (format==EF_B10G11R11_UFLOAT_PACK32) + bitshift = channel==2u ? 6:5; + else + bitshift = 4; + + uint16_t f16 = core::Float16Compressor::compress(f); + uint16_t enc = f16 >> bitshift; + uint16_t next_f16 = (enc + 1) << bitshift; + + return core::Float16Compressor::decompress(next_f16) - f; + } + default: break; } auto bytesPerChannel = (getBytesPerPixel(format)*core::rational(1,getFormatChannelCount(format))).getIntegerApprox(); switch (bytesPerChannel) { - case 2u: - { - float f = std::abs(static_cast(value)); - uint16_t f16 = core::Float16Compressor::compress(f); - uint16_t dir = core::Float16Compressor::compress(2.f*(f+1.f)); - return core::Float16Compressor::decompress( core::nextafter16(f16, dir) ) - f; - } - case 4u: - { - float f32 = std::abs(static_cast(value)); - return core::nextafter32(f32,2.f*(f32+1.f))-f32; - } - case 8u: - { - double f64 = std::abs(static_cast(value)); - return core::nextafter64(f64,2.0*(f64+1.0))-f64; - } - default: break; + case 2u: + { + float f = std::abs(static_cast(value)); + uint16_t f16 = core::Float16Compressor::compress(f); + uint16_t dir = core::Float16Compressor::compress(2.f*(f+1.f)); + return core::Float16Compressor::decompress( core::nextafter16(f16, dir) ) - f; + } + case 4u: + { + float f32 = std::abs(static_cast(value)); + return core::nextafter32(f32,2.f*(f32+1.f))-f32; + } + case 8u: 
+ { + double f64 = std::abs(static_cast(value)); + return core::nextafter64(f64,2.0*(f64+1.0))-f64; + } + default: break; } } diff --git a/include/nbl/asset/format/encodePixels.h b/include/nbl/asset/format/encodePixels.h index 2db1c08bcb..293bad884f 100644 --- a/include/nbl/asset/format/encodePixels.h +++ b/include/nbl/asset/format/encodePixels.h @@ -2488,6 +2488,7 @@ namespace asset inp >>= 52; inp &= 0x7ffull; inp -= (1023ull - 15ull); + // TODO: this is wrong, need to get maximum exponent across all 3 input values exp = (static_cast(inp) << 27); } for (uint32_t i = 0u; i < 3u; ++i) From bc7e24de81d850b1ce991516c0494081aecc92c1 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 13:23:14 +0100 Subject: [PATCH 37/62] Make the SPhysicalDeviceFilter use spans for requirement arrays. Adjust working examples accordingly Also correct bad DXC merge --- .../video/utilities/SPhysicalDeviceFilter.h | 61 ++++++++----------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/include/nbl/video/utilities/SPhysicalDeviceFilter.h b/include/nbl/video/utilities/SPhysicalDeviceFilter.h index ef5d15661d..d502be8a79 100644 --- a/include/nbl/video/utilities/SPhysicalDeviceFilter.h +++ b/include/nbl/video/utilities/SPhysicalDeviceFilter.h @@ -23,8 +23,7 @@ namespace nbl::video size_t size = 0ull; core::bitflag memoryFlags = IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; }; - const MemoryRequirement* memoryRequirements = nullptr; - uint32_t memoryRequirementsCount = 0u; + std::span memoryRequirements = {}; struct QueueRequirement { @@ -49,8 +48,7 @@ namespace nbl::video // family's transfer granularity needs to be <= asset::VkExtent3D maxImageTransferGranularity = {0x80000000u,0x80000000u,0x80000000u}; }; - const QueueRequirement* queueRequirements = nullptr; - uint32_t queueRequirementsCount = 0u; + std::span queueRequirements = {}; // To determine whether a queue family of a physical device supports presentation to a given surface // See 
vkGetPhysicalDeviceSurfaceSupportKHR @@ -60,8 +58,7 @@ namespace nbl::video // Setting this to `EQF_NONE` means it sufffices to find any queue family that can present to this surface, regardless of flags it might have core::bitflag presentationQueueFlags = IQueue::FAMILY_FLAGS::NONE; }; - const SurfaceCompatibility* requiredSurfaceCompatibilities = nullptr; - uint32_t requiredSurfaceCompatibilitiesCount = 0u; + std::span requiredSurfaceCompatibilities = {}; // sift through multiple devices @@ -120,28 +117,24 @@ namespace nbl::video return false; // Surface Compatibility - if (requiredSurfaceCompatibilities != nullptr) + for (const auto& requiredSurfaceCompatibility : requiredSurfaceCompatibilities) { - for (uint32_t i = 0u; i < requiredSurfaceCompatibilitiesCount; ++i) - { - const auto& requiredSurfaceCompatibility = requiredSurfaceCompatibilities[i]; - if (requiredSurfaceCompatibility.surface == nullptr) - continue; // we don't care about compatibility with a nullptr surface :) + if (requiredSurfaceCompatibility.surface == nullptr) + continue; // we don't care about compatibility with a nullptr surface :) - const auto& queueFamilyProperties = physicalDevice->getQueueFamilyProperties(); - - bool physicalDeviceSupportsSurfaceWithQueueFlags = false; - for (uint32_t qfam = 0u; qfam < queueFamilyProperties.size(); ++qfam) - { - const auto& familyProperty = queueFamilyProperties[qfam]; - if(familyProperty.queueFlags.hasFlags(requiredSurfaceCompatibility.presentationQueueFlags)) - if(requiredSurfaceCompatibility.surface->isSupportedForPhysicalDevice(physicalDevice, qfam)) - physicalDeviceSupportsSurfaceWithQueueFlags = true; - } - - if(!physicalDeviceSupportsSurfaceWithQueueFlags) - return false; + const auto& queueFamilyProperties = physicalDevice->getQueueFamilyProperties(); + + bool physicalDeviceSupportsSurfaceWithQueueFlags = false; + for (uint32_t qfam = 0u; qfam < queueFamilyProperties.size(); ++qfam) + { + const auto& familyProperty = queueFamilyProperties[qfam]; 
+ if(familyProperty.queueFlags.hasFlags(requiredSurfaceCompatibility.presentationQueueFlags)) + if(requiredSurfaceCompatibility.surface->isSupportedForPhysicalDevice(physicalDevice, qfam)) + physicalDeviceSupportsSurfaceWithQueueFlags = true; } + + if(!physicalDeviceSupportsSurfaceWithQueueFlags) + return false; } // Memory Requirements Checking: @@ -155,25 +148,23 @@ namespace nbl::video } // over-estimation, Not exact // TODO: Exact or Better Logic -> try find a feasible fitting of requirements into heaps. - for (uint32_t m = 0; m < memoryRequirementsCount; ++m) + for (const auto& req : memoryRequirements) { - size_t memSize = memoryRequirements[m].size; - for (uint32_t h = 0; h < memoryProps.memoryHeapCount; ++h) - if (heapFlags[h].hasFlags(memoryRequirements[m].memoryFlags)) - memSize = (memoryProps.memoryHeaps[h].size > memSize) ? 0ull : memSize - memoryProps.memoryHeaps[h].size; - if (memSize > 0) + size_t memSize = req.size; + for (uint32_t h=0; hmemSize ? 0ull:(memSize-memoryProps.memoryHeaps[h].size); + if (memSize>0) return false; } // Queue Requirements Checking: // over-estimation, Not exact // TODO: Exact or Better Logic -> try find a feasible fitting of requirements into queue families. 
- for (uint32_t q = 0; q < queueRequirementsCount; ++q) + for (const auto& queueReqs : queueRequirements) { - const auto& queueReqs = queueRequirements[q]; uint32_t queueCount = queueReqs.queueCount; - - for (uint32_t qfam = 0; qfam < queueProps.size(); ++qfam) + for (uint32_t qfam=0; qfam Date: Fri, 12 Jan 2024 14:48:45 +0100 Subject: [PATCH 38/62] ok so I found out that renderdoc hates External memory --- include/nbl/video/SPhysicalDeviceLimits.h | 2 +- src/nbl/video/CVulkanPhysicalDevice.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/SPhysicalDeviceLimits.h b/include/nbl/video/SPhysicalDeviceLimits.h index 7f58a67443..c3e13f145b 100644 --- a/include/nbl/video/SPhysicalDeviceLimits.h +++ b/include/nbl/video/SPhysicalDeviceLimits.h @@ -332,7 +332,7 @@ struct SPhysicalDeviceLimits /* VK_EXT_external_memory_host */ /* ExternalMemoryHostPropertiesEXT */ - uint32_t minImportedHostPointerAlignment = 0x1u<<16u; + uint32_t minImportedHostPointerAlignment = 0x1u<<31u; /* ShaderAtomicFloatFeaturesEXT *//* VK_EXT_shader_atomic_float */ // [REQUIRE] Nabla Core Profile diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index e457ae3a2b..030890b187 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -257,7 +257,7 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart return availableFeatureSet.find(name)!=availableFeatureSet.end(); }; //! Required by Nabla Core Profile - if (!isExtensionSupported(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME)) + if (!rdoc && !isExtensionSupported(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME)) return nullptr; if (!isExtensionSupported(VK_EXT_SHADER_ATOMIC_FLOAT_EXTENSION_NAME)) return nullptr; @@ -294,7 +294,6 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart addToPNextChain(&vulkan13Properties); //! 
Required by Nabla Core Profile VkPhysicalDeviceExternalMemoryHostPropertiesEXT externalMemoryHostProperties = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT }; - addToPNextChain(&externalMemoryHostProperties); VkPhysicalDeviceRobustness2PropertiesEXT robustness2Properties = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT }; addToPNextChain(&robustness2Properties); //! Extensions (ordered by spec extension number) @@ -314,6 +313,9 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart #endif VkPhysicalDeviceShaderSMBuiltinsPropertiesNV shaderSMBuiltinsPropertiesNV = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SM_BUILTINS_PROPERTIES_NV }; VkPhysicalDeviceShaderCoreProperties2AMD shaderCoreProperties2AMD = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_2_AMD }; + //! Because Renderdoc is special and instead of ignoring extensions it whitelists them + if (isExtensionSupported(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME)) + addToPNextChain(&externalMemoryHostProperties); //! This is only written for convenience to avoid getting validation errors otherwise vulkan will just skip any strutctures it doesn't recognize if (isExtensionSupported(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME)) addToPNextChain(&conservativeRasterizationProperties); @@ -504,7 +506,8 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart //! 
Nabla Core Extensions - properties.limits.minImportedHostPointerAlignment = externalMemoryHostProperties.minImportedHostPointerAlignment; + if (isExtensionSupported(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME)) // renderdoc special + properties.limits.minImportedHostPointerAlignment = externalMemoryHostProperties.minImportedHostPointerAlignment; // there's no ShaderAtomicFloatPropertiesEXT @@ -1398,7 +1401,7 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic extensionsToEnable.insert(VK_KHR_EXTERNAL_FENCE_WIN32_EXTENSION_NAME); // All Requirements Exist in Vulkan 1.1 (including instance extensions) #endif enableExtensionIfAvailable(VK_KHR_EXTERNAL_FENCE_FD_EXTENSION_NAME); - extensionsToEnable.insert(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME); + enableExtensionIfAvailable(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME); extensionsToEnable.insert(VK_EXT_SHADER_ATOMIC_FLOAT_EXTENSION_NAME); //! required but has overhead so conditional From b5a633a6b2d4ba113f739883e50d8dcc2cc74932 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 14:56:17 +0100 Subject: [PATCH 39/62] fix typos causing issues --- include/nbl/asset/IFramebuffer.h | 2 +- src/nbl/video/IGPUCommandBuffer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IFramebuffer.h b/include/nbl/asset/IFramebuffer.h index c2d2f21085..99295e7503 100644 --- a/include/nbl/asset/IFramebuffer.h +++ b/include/nbl/asset/IFramebuffer.h @@ -96,7 +96,7 @@ class IFramebuffer if (!attachments[i]) return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-commonparent - if (rp->isCompatibleDevicewise(attachments[i].get())) + if (!rp->isCompatibleDevicewise(attachments[i].get())) return true; const auto& viewParams = attachments[i]->getCreationParameters(); diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 2ede1f2c0f..0f890ffdb8 100644 --- 
a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -749,7 +749,7 @@ bool IGPUCommandBuffer::pushConstants(const IGPUPipelineLayout* const layout, co if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT)) return false; - if (!layout || this->isCompatibleDevicewise(layout)) + if (!layout || !this->isCompatibleDevicewise(layout)) return false; if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(layout))) From 2ab33eda4cb684172a9e0d891012d0cccda0d3df Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 16:44:30 +0100 Subject: [PATCH 40/62] API draft --- include/nbl/video/utilities/IUtilities.h | 141 +++++++++++++++-------- src/nbl/video/utilities/IUtilities.cpp | 7 ++ 2 files changed, 101 insertions(+), 47 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 89d45da3f7..2a97610d71 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -211,7 +211,75 @@ class NBL_API2 IUtilities : public core::IReferenceCounted updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u, params.size, core::smart_refctd_ptr(buffer)}, data, queue); return buffer; } +#endif + struct SIntendedSubmitInfo final + { + public: + inline bool valid() const + { + if (!queue || commandBuffers.empty() || signalSemaphores.empty()) + return false; + if (!getScratchCommandBuffer()->isResettable()) + return false; + if (!getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return false; + for (const auto& info : commandBuffers) + if (info.cmdbuf->getPool()->getQueueFamilyIndex()!=queue->getFamilyIndex()) + return false; + return true; + } + + // Use the last command buffer in intendedNextSubmit, it should be in recording state + inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.back().cmdbuf;} + inline const 
IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.back().cmdbuf;} + + inline ISemaphore::SWaitInfo getScratchSemaphoreNextWait() const {return {signalSemaphores.front().semaphore,signalSemaphores.front().value};} + + inline operator IQueue::SSubmitInfo() const + { + return { + .waitSemaphores = waitSemaphores, + .commandBuffers = commandBuffers, + .signalSemaphores = signalSemaphores + }; + } + + inline void overflowSubmit() + { + auto cmdbuf = getScratchCommandBuffer(); + auto& scratchSemaphore = signalSemaphores.front(); + // but first sumbit the already buffered up copies + cmdbuf->end(); + IQueue::SSubmitInfo submit = *this; + // we only signal the last semaphore which is used as scratch + submit.signalSemaphores = {&scratchSemaphore,1}; + assert(submit.isValid()); + queue->submit({&submit,1}); + // We wait (stall) on the immediately preceeding submission timeline semaphore signal value and increase it for the next signaller + { + const ISemaphore::SWaitInfo info = {scratchSemaphore.semaphore,scratchSemaphore.value++}; + const_cast(cmdbuf->getOriginDevice())->blockForSemaphores({&info,1}); + } + // we've already waited on the Host for the semaphores, no use waiting twice + waitSemaphores = {}; + // since all the commandbuffers have submitted already we only reuse the last one + commandBuffers = {&commandBuffers.back(),1}; + // we will still signal the same set in the future + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + } + + + IQueue* queue = {}; + std::span waitSemaphores = {}; + std::span commandBuffers = {}; + std::span signalSemaphores = {}; + + private: + friend class IUtilities; + static const char* ErrorText; + }; // -------------- // updateBufferRangeViaStagingBuffer // -------------- @@ -230,7 +298,10 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! ** The last command buffer will be used to record the copy commands //! 
- submissionQueue: IQueue used to submit, when needed. //! Note: This parameter is required but may not be used if there is no need to submit - //! - submissionFence: + //! - scratchSemaphore: + //! - since you've already decided on the semaphores you'll wait and signal in the `intendedNextSubmit`, we need an extra semaphore to "stich together" the submit if we split it + + //! - This is the fence you will use to submit the copies to, this allows freeing up space in stagingBuffer when the fence is signalled, indicating that the copy has finished. //! - This fence will be in `UNSIGNALED` state after exiting the function. (It will reset after each implicit submit) //! - This fence may be used for CommandBuffer submissions using `submissionQueue` inside the function. @@ -249,31 +320,26 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! * submissionFence must point to a valid IGPUFence //! * submissionFence must be in `UNSIGNALED` state //! ** IUtility::getDefaultUpStreamingBuffer()->cull_frees() should be called before reseting the submissionFence and after fence is signaled. 
- [[nodiscard("Use The New IQueue::SubmitInfo")]] inline IQueue::SSubmitInfo updateBufferRangeViaStagingBuffer( - const asset::SBufferRange& bufferRange, const void* data, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo intendedNextSubmit - ) + inline bool updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { - if(!intendedNextSubmit.isValid() || intendedNextSubmit.commandBufferCount <= 0u) + if (!bufferRange.isValid() || !bufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_DST_BIT)) { - // TODO: log error -> intendedNextSubmit is invalid - assert(false); - return intendedNextSubmit; + m_logger.log("Invalid `bufferRange` or buffer has no `EUF_TRANSFER_DST_BIT` usage flag, cannot `updateBufferRangeViaStagingBuffer`!", system::ILogger::ELL_ERROR); + return false; + } + + if (!nextSubmit.valid()) + { + m_logger.log(nextSubmit.ErrorText,system::ILogger::ELL_ERROR); + return false; } const auto& limits = m_device->getPhysicalDevice()->getLimits(); - const uint32_t optimalTransferAtom = limits.maxResidentInvocations*sizeof(uint32_t); - - // Use the last command buffer in intendedNextSubmit, it should be in recording state - auto& cmdbuf = intendedNextSubmit.commandBuffers[intendedNextSubmit.commandBufferCount-1]; - auto* cmdpool = cmdbuf->getPool(); - assert(cmdbuf->isResettable()); - assert(cmdpool->getQueueFamilyIndex() == submissionQueue->getFamilyIndex()); - assert(cmdbuf->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)); - assert(bufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_DST_BIT)); + const uint32_t optimalTransferAtom = limits.maxResidentInvocations * sizeof(uint32_t); + auto cmdbuf = nextSubmit.getScratchCommandBuffer(); // no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal - 
for (size_t uploadedSize = 0ull; uploadedSize < bufferRange.size;) + for (size_t uploadedSize=0ull; uploadedSize::invalid_value) { - // but first sumbit the already buffered up copies - cmdbuf->end(); - IQueue::SSubmitInfo submit = intendedNextSubmit; - submit.signalSemaphoreCount = 0u; - submit.pSignalSemaphores = nullptr; - assert(submit.isValid()); - submissionQueue->submit(1u, &submit, submissionFence); - m_device->blockForFences(1u, &submissionFence); - intendedNextSubmit.commandBufferCount = 1u; - intendedNextSubmit.commandBuffers = &cmdbuf; - intendedNextSubmit.waitSemaphoreCount = 0u; - intendedNextSubmit.pWaitSemaphores = nullptr; - intendedNextSubmit.pWaitDstStageMask = nullptr; - // before resetting we need poll all events in the allocator's deferred free list - m_defaultUploadBuffer->cull_frees(); - // we can reset the fence and commandbuffer because we fully wait for the GPU to finish here - m_device->resetFences(1u, &submissionFence); - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + nextSubmit.overflowSubmit(); continue; } // some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly if (m_defaultUploadBuffer.get()->needsManualFlushOrInvalidate()) { - auto flushRange = AlignedMappedMemoryRange(m_defaultUploadBuffer.get()->getBuffer()->getBoundMemory(),localOffset,subSize,limits.nonCoherentAtomSize); + auto flushRange = AlignedMappedMemoryRange(m_defaultUploadBuffer.get()->getBuffer()->getBoundMemory().memory,localOffset,subSize,limits.nonCoherentAtomSize); m_device->flushMappedMemoryRanges(1u,&flushRange); } // after we make sure writes are in GPU memory (visible to GPU) and not still in a cache, we can copy using the GPU to device-only memory IGPUCommandBuffer::SBufferCopy copy; copy.srcOffset = localOffset; - copy.dstOffset = bufferRange.offset + uploadedSize; + copy.dstOffset = bufferRange.offset+uploadedSize; copy.size 
= subSize; cmdbuf->copyBuffer(m_defaultUploadBuffer.get()->getBuffer(), bufferRange.buffer.get(), 1u, ©); - // this doesn't actually free the memory, the memory is queued up to be freed only after the GPU fence/event is signalled - m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,core::smart_refctd_ptr(submissionFence),&cmdbuf); // can queue with a reset but not yet pending fence, just fine + // this doesn't actually free the memory, the memory is queued up to be freed only after the `scratchSemaphore` reaches a value a future submit will signal + m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getScratchSemaphoreNextWait(),&cmdbuf); uploadedSize += subSize; } - return intendedNextSubmit; + return true; } - +#if 0 //! This function is an specialization of the `updateBufferRangeViaStagingBuffer` function above. //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit //! WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads @@ -373,7 +421,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted } //! This function is an specialization of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. - //! Additionally waits for the fence + //! Additionally waits for the upload right away //! WARNING: This function blocks CPU and stalls the GPU! inline void updateBufferRangeViaStagingBufferAutoSubmit( const asset::SBufferRange& bufferRange, const void* data, @@ -391,10 +439,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted updateBufferRangeViaStagingBufferAutoSubmit(bufferRange, data, submissionQueue, fence.get(), submitInfo); m_device->blockForFences(1u, &fence.get()); } - +#endif // pipelineBarrierAutoSubmit? 
-#endif // -------------- // downloadBufferRangeViaStagingBuffer diff --git a/src/nbl/video/utilities/IUtilities.cpp b/src/nbl/video/utilities/IUtilities.cpp index e46aec48ad..5ad7612f1e 100644 --- a/src/nbl/video/utilities/IUtilities.cpp +++ b/src/nbl/video/utilities/IUtilities.cpp @@ -4,6 +4,13 @@ namespace nbl::video { +const char* IUtilities::SIntendedSubmitInfo::ErrorText = R"===(Invalid `IUtilities::SIntendedSubmitInfo`, possible reasons are: +- No `commandBuffers` or `signalSemaphores` given in respective spans +- `commandBuffer.back()` is not Resettable +- `commandBuffer.back()` is not already begun with ONE_TIME_SUBMIT_BIT +- one of the `commandBuffer`s' Pool's Queue Family Index doesn't match `queue`'s Family +)==="; + #if 0 // TODO: port IQueue::SSubmitInfo IUtilities::updateImageViaStagingBuffer( asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, asset::IImage::LAYOUT currentDstImageLayout, const core::SRange& regions, From bbc5aa994a133ec0b2a8cc8a6ebe1e0b01a58958 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 17:27:10 +0100 Subject: [PATCH 41/62] think about the other 3 utility functions --- include/nbl/video/utilities/IUtilities.h | 131 +++++++++++------------ 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 2a97610d71..bd4c748c7d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -194,60 +194,52 @@ class NBL_API2 IUtilities : public core::IReferenceCounted )); return allocationSize; } - -#if 0 // TODO: port - //! WARNING: This function blocks the CPU and stalls the GPU! 
- inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(IQueue* queue, IGPUBuffer::SCreationParams&& params, const void* data) + + struct SFrontHalfSubmitInfo final { - if(!params.usage.hasFlags(IGPUBuffer::EUF_TRANSFER_DST_BIT)) - { - assert(false); - return nullptr; - } - auto buffer = m_device->createBuffer(std::move(params)); - auto mreqs = buffer->getMemoryReqs(); - mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto mem = m_device->allocate(mreqs, buffer.get()); - updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u, params.size, core::smart_refctd_ptr(buffer)}, data, queue); - return buffer; - } -#endif + inline bool valid() const {return queue;} + // Use the last command buffer in intendedNextSubmit, it should be in recording state + inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + + + IQueue* queue = {}; + std::span waitSemaphores = {}; + std::span commandBuffers = {}; + }; + //! Struct meant to be used with any utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. + //! Such functions are non-blocking (unless overflow) and take `SIntendedSubmitInfo` by reference and patch it accordingly. + //! MAKE SURE to do a submit to `queue` by yourself with a submit info obtained by casting `this` to `IQueue::SSubmitInfo` ! + //! for example: in the case the `frontHalf.waitSemaphores` were already waited upon, the struct will be modified to have it's `waitSemaphores` emptied. 
struct SIntendedSubmitInfo final { public: inline bool valid() const { - if (!queue || commandBuffers.empty() || signalSemaphores.empty()) + if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) return false; - if (!getScratchCommandBuffer()->isResettable()) + if (!frontHalf.getScratchCommandBuffer()->isResettable()) return false; - if (!getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - return false; - for (const auto& info : commandBuffers) - if (info.cmdbuf->getPool()->getQueueFamilyIndex()!=queue->getFamilyIndex()) + if (!frontHalf.getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) return false; return true; } - // Use the last command buffer in intendedNextSubmit, it should be in recording state - inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.back().cmdbuf;} - inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.back().cmdbuf;} - inline ISemaphore::SWaitInfo getScratchSemaphoreNextWait() const {return {signalSemaphores.front().semaphore,signalSemaphores.front().value};} inline operator IQueue::SSubmitInfo() const { return { - .waitSemaphores = waitSemaphores, - .commandBuffers = commandBuffers, + .waitSemaphores = frontHalf.waitSemaphores, + .commandBuffers = frontHalf.commandBuffers, .signalSemaphores = signalSemaphores }; } inline void overflowSubmit() { - auto cmdbuf = getScratchCommandBuffer(); + auto cmdbuf = frontHalf.getScratchCommandBuffer(); auto& scratchSemaphore = signalSemaphores.front(); // but first sumbit the already buffered up copies cmdbuf->end(); @@ -255,31 +247,32 @@ class NBL_API2 IUtilities : public core::IReferenceCounted // we only signal the last semaphore which is used as scratch submit.signalSemaphores = {&scratchSemaphore,1}; assert(submit.isValid()); - queue->submit({&submit,1}); + frontHalf.queue->submit({&submit,1}); // We wait 
(stall) on the immediately preceeding submission timeline semaphore signal value and increase it for the next signaller { const ISemaphore::SWaitInfo info = {scratchSemaphore.semaphore,scratchSemaphore.value++}; const_cast(cmdbuf->getOriginDevice())->blockForSemaphores({&info,1}); } // we've already waited on the Host for the semaphores, no use waiting twice - waitSemaphores = {}; + frontHalf.waitSemaphores = {}; // since all the commandbuffers have submitted already we only reuse the last one - commandBuffers = {&commandBuffers.back(),1}; + frontHalf.commandBuffers = {&frontHalf.commandBuffers.back(),1}; // we will still signal the same set in the future cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); } - IQueue* queue = {}; - std::span waitSemaphores = {}; - std::span commandBuffers = {}; + //! The last CommandBuffer will be used to record the copy commands + SFrontHalfSubmitInfo frontHalf = {}; + //! The first Semaphore will be used as a scratch, so don't use it yourself as we can advance the counter an arbitrary amount! std::span signalSemaphores = {}; private: friend class IUtilities; static const char* ErrorText; }; + // -------------- // updateBufferRangeViaStagingBuffer // -------------- @@ -287,15 +280,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! Copies `data` to stagingBuffer and Records the commands needed to copy the data from stagingBuffer to `bufferRange.buffer` //! If the allocation from staging memory fails due to large buffer size or fragmentation then This function may need to submit the command buffer via the `submissionQueue`. //! Returns: - //! IQueue::SSubmitInfo to use for command buffer submission instead of `intendedNextSubmit`. - //! for example: in the case the `SSubmitInfo::waitSemaphores` were already signalled, the new SSubmitInfo will have it's waitSemaphores emptied from `intendedNextSubmit`. - //! 
Make sure to submit with the new SSubmitInfo returned by this function + //! the number of times we overflown and had to submit, <0 [negative] on failure //! Parameters: + //! - nextSubmit: + //! Is the SubmitInfo you intended to submit your command buffers with, it will be patched if overflow occurred @see SIntendedSubmitInfo //! - bufferRange: contains offset + size into bufferRange::buffer that will be copied from `data` (offset doesn't affect how `data` is accessed) //! - data: raw pointer to data that will be copied to bufferRange::buffer - //! - intendedNextSubmit: - //! Is the SubmitInfo you intended to submit your command buffers. - //! ** The last command buffer will be used to record the copy commands //! - submissionQueue: IQueue used to submit, when needed. //! Note: This parameter is required but may not be used if there is no need to submit //! - scratchSemaphore: @@ -320,24 +310,25 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! * submissionFence must point to a valid IGPUFence //! * submissionFence must be in `UNSIGNALED` state //! ** IUtility::getDefaultUpStreamingBuffer()->cull_frees() should be called before reseting the submissionFence and after fence is signaled. 
- inline bool updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) + inline int64_t updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { if (!bufferRange.isValid() || !bufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_DST_BIT)) { m_logger.log("Invalid `bufferRange` or buffer has no `EUF_TRANSFER_DST_BIT` usage flag, cannot `updateBufferRangeViaStagingBuffer`!", system::ILogger::ELL_ERROR); - return false; + return -1; } if (!nextSubmit.valid()) { m_logger.log(nextSubmit.ErrorText,system::ILogger::ELL_ERROR); - return false; + return -1; } const auto& limits = m_device->getPhysicalDevice()->getLimits(); const uint32_t optimalTransferAtom = limits.maxResidentInvocations * sizeof(uint32_t); - auto cmdbuf = nextSubmit.getScratchCommandBuffer(); + auto cmdbuf = nextSubmit.frontHalf.getScratchCommandBuffer(); + int64_t overflowCounter = 0; // no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal for (size_t uploadedSize=0ull; uploadedSize::invalid_value) { nextSubmit.overflowSubmit(); + overflowCounter++; continue; } // some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly @@ -380,9 +372,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getScratchSemaphoreNextWait(),&cmdbuf); uploadedSize += subSize; } - return true; + return overflowCounter; } -#if 0 + //! This function is an specialization of the `updateBufferRangeViaStagingBuffer` function above. //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit //! 
WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads @@ -399,16 +391,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! Valid Usage: //! * If submitInfo::commandBufferCount > 0 and the last command buffer must be in one of these stages: `EXECUTABLE`, `INITIAL`, `RECORDING` //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. - inline void updateBufferRangeViaStagingBufferAutoSubmit( - const asset::SBufferRange& bufferRange, const void* data, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo submitInfo = {} - ) + inline bool updateBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { - if(!submitInfo.isValid()) + if(!nextSubmit.frontHalf.valid()) { // TODO: log error - assert(false); - return; + return false; } CSubmitInfoPatcher submitInfoPatcher; @@ -418,28 +406,39 @@ class NBL_API2 IUtilities : public core::IReferenceCounted assert(submitInfo.isValid()); submissionQueue->submit(1u,&submitInfo,submissionFence); + return true; } //! This function is an specialization of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. //! Additionally waits for the upload right away //! WARNING: This function blocks CPU and stalls the GPU! 
- inline void updateBufferRangeViaStagingBufferAutoSubmit( - const asset::SBufferRange& bufferRange, const void* data, - IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo = {} - ) + inline bool updateBufferRangeViaStagingBufferAutoSubmit(const SFrontHalfSubmitInfo& submit, const asset::SBufferRange& bufferRange, const void* data) { - if(!submitInfo.isValid()) + if(!submit.valid()) { // TODO: log error - assert(false); - return; + return false; } - auto fence = m_device->createFence(static_cast(0)); - updateBufferRangeViaStagingBufferAutoSubmit(bufferRange, data, submissionQueue, fence.get(), submitInfo); - m_device->blockForFences(1u, &fence.get()); + auto semaphore = m_device->createSemaphore(0); + if (!updateBufferRangeViaStagingBufferAutoSubmit(,bufferRange,data)) + return false; + const ISemaphore::SWaitInfo info = {semaphore.get(),1}; + m_device->blockForSemaphores({&info,1}); + return true; + } + + //! WARNING: This function blocks the CPU and stalls the GPU! + inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(const SFrontHalfSubmitInfo& submit, IGPUBuffer::SCreationParams&& params, const void* data) + { + auto buffer = m_device->createBuffer(std::move(params)); + auto mreqs = buffer->getMemoryReqs(); + mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto mem = m_device->allocate(mreqs,buffer.get()); + if (!updateBufferRangeViaStagingBufferAutoSubmit(submit,asset::SBufferRange{0u,params.size,core::smart_refctd_ptr(buffer)},data)) + return nullptr; + return buffer; } -#endif // pipelineBarrierAutoSubmit? 
From d41f279d50c8ea129776ea2868e4e4cccde47442 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 19:59:47 +0100 Subject: [PATCH 42/62] design clearing up --- include/nbl/video/utilities/IUtilities.h | 155 ++++++++++++----------- 1 file changed, 81 insertions(+), 74 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index bd4c748c7d..79a088ef2d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -195,20 +195,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return allocationSize; } - struct SFrontHalfSubmitInfo final - { - inline bool valid() const {return queue;} - - // Use the last command buffer in intendedNextSubmit, it should be in recording state - inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} - inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} - - - IQueue* queue = {}; - std::span waitSemaphores = {}; - std::span commandBuffers = {}; - }; - //! Struct meant to be used with any utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. + //! Struct meant to be used with any Utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. //! Such functions are non-blocking (unless overflow) and take `SIntendedSubmitInfo` by reference and patch it accordingly. //! MAKE SURE to do a submit to `queue` by yourself with a submit info obtained by casting `this` to `IQueue::SSubmitInfo` ! //! for example: in the case the `frontHalf.waitSemaphores` were already waited upon, the struct will be modified to have it's `waitSemaphores` emptied. 
@@ -219,8 +206,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted { if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) return false; + // Must be resettable so we can end, submit, wait, reset and continue recording commands into it as-if nothing happened if (!frontHalf.getScratchCommandBuffer()->isResettable()) return false; + // It makes no sense to reuse the same commands for a second submission. + // Moreover its dangerous because the utilities record their own internal commands which might use subresources for which + // frees have already been latched on the scratch semaphore you must signal anyway. if (!frontHalf.getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) return false; return true; @@ -263,9 +254,32 @@ class NBL_API2 IUtilities : public core::IReferenceCounted } - //! The last CommandBuffer will be used to record the copy commands - SFrontHalfSubmitInfo frontHalf = {}; - //! The first Semaphore will be used as a scratch, so don't use it yourself as we can advance the counter an arbitrary amount! + //! The last CommandBuffer will be used to record the copy commands + struct SFrontHalf final + { + //! We can't check it, but all (if any) all the command buffers except the last one should be in `EXECUTABLE` state. + inline bool valid() const {return queue;} + + // Use the last command buffer in intendedNextSubmit, it should be in recording state + inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? 
nullptr:commandBuffers.back().cmdbuf;} + + // This parameter is required but may be unused if there is no need to submit + IQueue* queue = {}; + // Use this parameter to wait for previous operations to finish before whatever commands the Utility you're using records + std::span waitSemaphores = {}; + // Fill the commandbuffers you want to run before the first command the Utility records to run in the same submit, + // for example baked command buffers with pipeline barrier commands. + // .... + std::span commandBuffers = {}; + } frontHalf = {}; + //! The first Semaphore will be used as a scratch, so don't choose the values for waits and signals yourself as we can advance the counter an arbitrary amount! + //! You can actually examine the change in `signalSemaphore.front().value` to figure out how many overflows occurred. + //! This semaphore is needed to "stitch together" additional submits if they occur so they occur before and after the original intended waits and signals. + //! We use the first semaphore to keep the intended order of original semaphore signal and waits unchanged no matter how many overflows occur. + //! You do however, NEED TO KEEP IT in the signal set of the last submit you're supposed to do manually, this allows freeing any resources used + //! after the submit is done, indicating that your streaming routine is done. + //! * Also use this parameter to signal new semaphores so that other submits know your Utility method is done. std::span signalSemaphores = {}; private: @@ -280,55 +294,34 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! Copies `data` to stagingBuffer and Records the commands needed to copy the data from stagingBuffer to `bufferRange.buffer` //! If the allocation from staging memory fails due to large buffer size or fragmentation then This function may need to submit the command buffer via the `submissionQueue`. //! Returns: - //! the number of times we overflown and had to submit, <0 [negative] on failure + //! 
True on successful recording of copy commands and handling of overflows, false on failure for any reason. //! Parameters: //! - nextSubmit: //! Is the SubmitInfo you intended to submit your command buffers with, it will be patched if overflow occurred @see SIntendedSubmitInfo //! - bufferRange: contains offset + size into bufferRange::buffer that will be copied from `data` (offset doesn't affect how `data` is accessed) //! - data: raw pointer to data that will be copied to bufferRange::buffer - //! - submissionQueue: IQueue used to submit, when needed. - //! Note: This parameter is required but may not be used if there is no need to submit - //! - scratchSemaphore: - //! - since you've already decided on the semaphores you'll wait and signal in the `intendedNextSubmit`, we need an extra semaphore to "stich together" the submit if we split it - - - //! - This is the fence you will use to submit the copies to, this allows freeing up space in stagingBuffer when the fence is signalled, indicating that the copy has finished. - //! - This fence will be in `UNSIGNALED` state after exiting the function. (It will reset after each implicit submit) - //! - This fence may be used for CommandBuffer submissions using `submissionQueue` inside the function. - //! ** NOTE: This fence will be signalled everytime there is a submission inside this function, which may be more than one until the job is finished. //! Valid Usage: + //! * nextSubmit must be valid (see `SIntendedSubmitInfo::valid()`) + //! * bufferRange must be valid (see `SBufferRange::isValid()`) //! * data must not be nullptr - //! * bufferRange should be valid (see SBufferRange::isValid()) - //! * intendedNextSubmit::commandBufferCount must be > 0 - //! * The commandBuffers should have been allocated from a CommandPool with the same queueFamilyIndex as `submissionQueue` - //! * The last command buffer should be in `RECORDING` state. - //! 
* The last command buffer should be must've called "begin()" with `IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT` flag - //! The reason is the commands recorded into the command buffer would not be valid for a second submission and the stagingBuffer memory wouldv'e been freed/changed. - //! * The last command buffer should be "resettable". See `ICommandBuffer::E_STATE` comments - //! * To ensure correct execution order, (if any) all the command buffers except the last one should be in `EXECUTABLE` state. - //! * submissionQueue must point to a valid IQueue - //! * submissionFence must point to a valid IGPUFence - //! * submissionFence must be in `UNSIGNALED` state - //! ** IUtility::getDefaultUpStreamingBuffer()->cull_frees() should be called before reseting the submissionFence and after fence is signaled. - inline int64_t updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) + inline bool updateBufferRangeViaStagingBuffer(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { if (!bufferRange.isValid() || !bufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_DST_BIT)) { m_logger.log("Invalid `bufferRange` or buffer has no `EUF_TRANSFER_DST_BIT` usage flag, cannot `updateBufferRangeViaStagingBuffer`!", system::ILogger::ELL_ERROR); - return -1; + return false; } if (!nextSubmit.valid()) { m_logger.log(nextSubmit.ErrorText,system::ILogger::ELL_ERROR); - return -1; + return false; } const auto& limits = m_device->getPhysicalDevice()->getLimits(); const uint32_t optimalTransferAtom = limits.maxResidentInvocations * sizeof(uint32_t); auto cmdbuf = nextSubmit.frontHalf.getScratchCommandBuffer(); - int64_t overflowCounter = 0; // no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal for (size_t uploadedSize=0ull; uploadedSize::invalid_value) { 
nextSubmit.overflowSubmit(); - overflowCounter++; continue; } // some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly @@ -372,18 +364,51 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getScratchSemaphoreNextWait(),&cmdbuf); uploadedSize += subSize; } - return overflowCounter; + return true; } - //! This function is an specialization of the `updateBufferRangeViaStagingBuffer` function above. - //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit + //! This method lets you wrap any other function following the "submit on overflow" pattern with the final submission + //! to `intendedSubmit.queue` happening automatically, no need for the user to handle the submit at the end. //! WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads + //! of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. //! Parameters: - //! - `submitInfo`: IQueue::SSubmitInfo used to submit the copy operations. - //! * Use this parameter to wait for previous operations to finish using submitInfo::waitSemaphores or signal new semaphores using submitInfo::signalSemaphores - //! * Fill submitInfo::commandBuffers with the commandbuffers you want to be submitted before the copy in this struct as well, for example pipeline barrier commands. - //! * Empty by default: waits for no semaphore and signals no semaphores. - //! Patches the submitInfo::commandBuffers + //! - `intendedSubmit`: more lax than regular `SIntendedSubmitInfo::valid()`, only needs a valid queue and at least one semaphore to signal (how else will you know you're done?) + //! 
since the submit must and will happen, there's no point updating the semaphore and commandbuffer info spans in the intendedSubmit + inline bool autoSubmit(const SIntendedSubmitInfo& intendedSubmit, const std::function& what) + { + if (!intendedSubmit.frontHalf.valid() || intendedSubmit.signalSemaphores.empty()) + { + // TODO: log error + return false; + } + + SIntendedSubmitInfo patchedSubmit = intendedSubmit; + if (!what(patchedSubmit)) + return false; + const IQueue::SSubmitInfo submit = patchedSubmit; + return intendedSubmit.frontHalf.queue->submit({&submit,1}); + } + + //! This function is an specialization of the `autoSubmit` function above, it will additionally wait on the Host (CPU) for the final submit to finish. + //! WARNING: This function blocks CPU and stalls the GPU! + inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) + { + auto semaphore = m_device->createSemaphore(0); + // so we begin latching everything on the value of 1, but if we overflow it increases + IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; + + SIntendedSubmitInfo intendedSubmit = {.frontHalf=submit,.signalSemaphores={&info,1}}; + if (!autoSubmit(intendedSubmit,what)) + return false; + + // Watch carefully and note that we might not be waiting on the value of `1` for why @see `SIntendedSubmitInfo::signalSemaphores` + const ISemaphore::SWaitInfo waitInfo = {info.semaphore,info.value}; + m_device->blockForSemaphores({&waitInfo,1}); + return true; + } + +#if 0 + //! Patches the intendedSubmit::frontHalf::commandBuffers //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands //! If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission //! 
If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. @@ -408,34 +433,16 @@ class NBL_API2 IUtilities : public core::IReferenceCounted submissionQueue->submit(1u,&submitInfo,submissionFence); return true; } - - //! This function is an specialization of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. - //! Additionally waits for the upload right away - //! WARNING: This function blocks CPU and stalls the GPU! - inline bool updateBufferRangeViaStagingBufferAutoSubmit(const SFrontHalfSubmitInfo& submit, const asset::SBufferRange& bufferRange, const void* data) - { - if(!submit.valid()) - { - // TODO: log error - return false; - } - - auto semaphore = m_device->createSemaphore(0); - if (!updateBufferRangeViaStagingBufferAutoSubmit(,bufferRange,data)) - return false; - const ISemaphore::SWaitInfo info = {semaphore.get(),1}; - m_device->blockForSemaphores({&info,1}); - return true; - } +#endif //! WARNING: This function blocks the CPU and stalls the GPU! 
- inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(const SFrontHalfSubmitInfo& submit, IGPUBuffer::SCreationParams&& params, const void* data) + inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(const SIntendedSubmitInfo::SFrontHalf& submit, IGPUBuffer::SCreationParams&& params, const void* data) { auto buffer = m_device->createBuffer(std::move(params)); auto mreqs = buffer->getMemoryReqs(); mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); auto mem = m_device->allocate(mreqs,buffer.get()); - if (!updateBufferRangeViaStagingBufferAutoSubmit(submit,asset::SBufferRange{0u,params.size,core::smart_refctd_ptr(buffer)},data)) + if (!autoSubmitAndBlock(submit,[&](auto& info){return updateBufferRangeViaStagingBuffer(info,asset::SBufferRange{0u,params.size,core::smart_refctd_ptr(buffer)},data);})) return nullptr; return buffer; } From 04d05da31a29c9f1ee52f88a5dfc3a1138d019bc Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 22:17:45 +0100 Subject: [PATCH 43/62] Ok we're done here with the Streaming Buffer upload port (removed the IUtilities method triples that only did less sophisticated submits: patched and blocking) See `createFilledDeviceLocalBufferOnDedMem` for how to wrap a utility method in a lambda and achieve the same result --- include/nbl/video/IQueue.h | 3 +- include/nbl/video/utilities/IUtilities.h | 362 ++++++++++------------- 2 files changed, 158 insertions(+), 207 deletions(-) diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index 232f4c6547..654d95a847 100644 --- a/include/nbl/video/IQueue.h +++ b/include/nbl/video/IQueue.h @@ -109,7 +109,8 @@ class IQueue : public core::Interface, public core::Unmovable virtual RESULT waitIdle() const = 0; // we cannot derive from IBackendObject because we can't derive from IReferenceCounted - inline bool wasCreatedBy(const ILogicalDevice* device) const { return device == m_originDevice; } + inline const ILogicalDevice* 
getOriginDevice() const {return m_originDevice;} + inline bool wasCreatedBy(const ILogicalDevice* device) const {return device==m_originDevice;} // Vulkan: const VkQueue* virtual const void* getNativeHandle() const = 0; diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 79a088ef2d..55af9a3750 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -206,13 +206,16 @@ class NBL_API2 IUtilities : public core::IReferenceCounted { if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) return false; + const auto* scratch = frontHalf.getScratchCommandBuffer(); // Must be resettable so we can end, submit, wait, reset and continue recording commands into it as-if nothing happened - if (!frontHalf.getScratchCommandBuffer()->isResettable()) + if (!scratch->isResettable()) return false; // It makes no sense to reuse the same commands for a second submission. // Moreover its dangerous because the utilities record their own internal commands which might use subresources for which // frees have already been latched on the scratch semaphore you must signal anyway. 
- if (!frontHalf.getScratchCommandBuffer()->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + if (!scratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return false; + if (scratch->getState()!=IGPUCommandBuffer::STATE::INITIAL) return false; return true; } @@ -228,6 +231,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted }; } + // One thing you might notice is that this results in a few implicit Memory and Execution Dependencies + // So there's a little bit of non-deterministic behaviour we won't fight (will not insert a barrier every time you "could-have" overflown) inline void overflowSubmit() { auto cmdbuf = frontHalf.getScratchCommandBuffer(); @@ -257,8 +262,93 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! The last CommandBuffer will be used to record the copy commands struct SFrontHalf final { - //! We can't check it, but all (if any) all the command buffers except the last one should be in `EXECUTABLE` state. - inline bool valid() const {return queue;} + //! Need a valid queue and all the command buffers except the last one should be in `EXECUTABLE` state. + inline bool valid() const + { + if (!queue) + return false; + if (!commandBuffers.empty()) + for (size_t i=0; igetState()==IGPUCommandBuffer::STATE::EXECUTABLE) + return false; + return true; + } + + //! Little class to hold the storage for the modified commandbuffer span until submission time. 
+ class CRAIISpanPatch final : core::Uncopyable + { + public: + inline ~CRAIISpanPatch() + { + toNullify->commandBuffers = {}; + } + inline CRAIISpanPatch(CRAIISpanPatch&& other) : CRAIISpanPatch() {operator=(std::move(other));} + inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) + { + commandBuffersStorage = std::move(rhs.commandBuffersStorage); + return *this; + } + + inline operator bool() const {return m_recordingCommandBuffer.get();} + + private: + friend SFrontHalf; + inline CRAIISpanPatch() = default; + inline CRAIISpanPatch(SFrontHalf* _toNull) : commandBuffersStorage(_toNull->commandBuffers.size()+1), toNullify(_toNull) {} + + core::vector commandBuffersStorage; + // If we made a new commandbuffer we need to nullify the span so it doesn't point at stale mem + SFrontHalf* toNullify = nullptr; + // If new one made, then need to hold reference to it, else its just an extra ref, but whatever + core::smart_refctd_ptr m_recordingCommandBuffer; + }; + //! Patches the `commandBuffers` and then makes sure the last command buffer is resettable, in recording state begun with ONE_TIME_SUBMIT + //! If we can't make the last cmdbuffer that way, we make a new one and add it onto the end (hence the name "patching") + //! If `commandBuffers.empty()`, it will create an implicit command buffer to use for recording commands, + //! else if the last command buffer is not feasible to use as scratch for whatever reason, + //! it will add another temporary command buffer to end of `commandBuffers` and use it for recording. + //! WARNING: If patching occurs: + //! - a submission must occur before the return value goes out of scope! + //! - if `!commandBuffers.empty()`, the last CommandBuffer won't be in the same state as it was before entering the function, + //! because it needs to be `end()`ed before the submission + //! - the destructor of the return value will clear `commandBuffers` span + //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. 
+ [[nodiscard("The RAII object returned by `patch()` provides lifetimes to your spans!")]] + inline CRAIISpanPatch patch() + { + if (auto* candidateScratch = getScratchCommandBuffer(); candidateScratch && candidateScratch->isResettable()) + switch(candidateScratch->getState()) + { + case IGPUCommandBuffer::STATE::INITIAL: + if (!candidateScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + break; + [[fallthrough]]; + case IGPUCommandBuffer::STATE::RECORDING: + if (!candidateScratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + break; + { + CRAIISpanPatch retval; + retval.m_recordingCommandBuffer = core::smart_refctd_ptr(candidateScratch); + return retval; + } + break; + default: + break; + } + + CRAIISpanPatch retval(this); + std::copy(commandBuffers.begin(),commandBuffers.end(),retval.commandBuffersStorage.begin()); + { + auto pool = const_cast(queue->getOriginDevice())->createCommandPool(queue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&retval.m_recordingCommandBuffer,1})) + return {}; + if (!retval.m_recordingCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return {}; + retval.commandBuffersStorage.back().cmdbuf = retval.m_recordingCommandBuffer.get(); + } + commandBuffers = retval.commandBuffersStorage; + return retval; + } // Use the last command buffer in intendedNextSubmit, it should be in recording state inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} @@ -270,7 +360,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted std::span waitSemaphores = {}; // Fill the commandbuffers you want to run before the first command the Utility records to run in the same submit, // for example baked command buffers with pipeline barrier commands. - // .... 
+ // Also remember that even though the last CommandBuffer is scratch, it you can record commands into it as well. std::span commandBuffers = {}; } frontHalf = {}; //! The first Semaphore will be used as a scratch, so don't choose the values for waits and signals yourself as we can advance the counter an arbitrary amount! @@ -287,6 +377,61 @@ class NBL_API2 IUtilities : public core::IReferenceCounted static const char* ErrorText; }; + + //! This method lets you wrap any other function following the "submit on overflow" pattern with the final submission + //! to `intendedSubmit.queue` happening automatically, no need for the user to handle the submit at the end. + //! WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads + //! of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. + //! Parameters: + //! - `intendedSubmit`: more lax than regular `SIntendedSubmitInfo::valid()`, only needs a valid queue and at least one semaphore to use as scratch and signal. + //! if you don't have a commandbuffer usable as scratch as the last one, we'll patch internally. 
+ inline IQueue::RESULT autoSubmit(SIntendedSubmitInfo& intendedSubmit, const std::function& what) + { + if (!intendedSubmit.frontHalf.valid() || intendedSubmit.signalSemaphores.empty()) + { + // TODO: log error + return IQueue::RESULT::OTHER_ERROR; + } + + const auto raii = intendedSubmit.frontHalf.patch(); + if (!raii) + { + // TODO: log error + return IQueue::RESULT::OTHER_ERROR; + } + + if (!what(intendedSubmit)) + return IQueue::RESULT::OTHER_ERROR; + intendedSubmit.frontHalf.getScratchCommandBuffer()->end(); + + const IQueue::SSubmitInfo submit = intendedSubmit; + if (const auto error=intendedSubmit.frontHalf.queue->submit({&submit,1}); error!=IQueue::RESULT::SUCCESS) + return error; + // If there's any subsequent submit in a chain, make sure it waits for this one to finish + // (to achieve a command ordering in the cmdbuffer transparent to overflow submits) + intendedSubmit.frontHalf.waitSemaphores = {&intendedSubmit.signalSemaphores.front(),1}; + intendedSubmit.signalSemaphores = {}; + return IQueue::RESULT::SUCCESS; + } + + //! This function is an specialization of the `autoSubmit` function above, it will additionally wait on the Host (CPU) for the final submit to finish. + //! WARNING: This function blocks CPU and stalls the GPU! 
+ inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) + { + auto semaphore = m_device->createSemaphore(0); + // so we begin latching everything on the value of 1, but if we overflow it increases + IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; + + SIntendedSubmitInfo intendedSubmit = {.frontHalf=submit,.signalSemaphores={&info,1}}; + if (autoSubmit(intendedSubmit,what)!=IQueue::RESULT::SUCCESS) + return false; + + // Watch carefully and note that we might not be waiting on the value of `1` for why @see `SIntendedSubmitInfo::signalSemaphores` + const ISemaphore::SWaitInfo waitInfo = {info.semaphore,info.value}; + m_device->blockForSemaphores({&waitInfo,1}); + return true; + } + // -------------- // updateBufferRangeViaStagingBuffer // -------------- @@ -367,74 +512,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return true; } - //! This method lets you wrap any other function following the "submit on overflow" pattern with the final submission - //! to `intendedSubmit.queue` happening automatically, no need for the user to handle the submit at the end. - //! WARNING: Don't use this function in hot loops or to do batch updates, its merely a convenience for one-off uploads - //! of the `updateBufferRangeViaStagingBufferAutoSubmit` function above. - //! Parameters: - //! - `intendedSubmit`: more lax than regular `SIntendedSubmitInfo::valid()`, only needs a valid queue and at least one semaphore to signal (how else will you know you're done?) - //! 
since the submit must and will happen, there's no point updating the semaphore and commandbuffer info spans in the intendedSubmit - inline bool autoSubmit(const SIntendedSubmitInfo& intendedSubmit, const std::function& what) - { - if (!intendedSubmit.frontHalf.valid() || intendedSubmit.signalSemaphores.empty()) - { - // TODO: log error - return false; - } - - SIntendedSubmitInfo patchedSubmit = intendedSubmit; - if (!what(patchedSubmit)) - return false; - const IQueue::SSubmitInfo submit = patchedSubmit; - return intendedSubmit.frontHalf.queue->submit({&submit,1}); - } - - //! This function is an specialization of the `autoSubmit` function above, it will additionally wait on the Host (CPU) for the final submit to finish. - //! WARNING: This function blocks CPU and stalls the GPU! - inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) - { - auto semaphore = m_device->createSemaphore(0); - // so we begin latching everything on the value of 1, but if we overflow it increases - IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; - - SIntendedSubmitInfo intendedSubmit = {.frontHalf=submit,.signalSemaphores={&info,1}}; - if (!autoSubmit(intendedSubmit,what)) - return false; - - // Watch carefully and note that we might not be waiting on the value of `1` for why @see `SIntendedSubmitInfo::signalSemaphores` - const ISemaphore::SWaitInfo waitInfo = {info.semaphore,info.value}; - m_device->blockForSemaphores({&waitInfo,1}); - return true; - } - -#if 0 - //! Patches the intendedSubmit::frontHalf::commandBuffers - //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission - //! 
If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. - //! WARNING: If commandBufferCount > 0, The last commandBuffer won't be in the same state as it was before entering the function, because it needs to be `end()`ed and submitted - //! Valid Usage: - //! * If submitInfo::commandBufferCount > 0 and the last command buffer must be in one of these stages: `EXECUTABLE`, `INITIAL`, `RECORDING` - //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. - inline bool updateBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) - { - if(!nextSubmit.frontHalf.valid()) - { - // TODO: log error - return false; - } - - CSubmitInfoPatcher submitInfoPatcher; - submitInfoPatcher.patchAndBegin(submitInfo, m_device, submissionQueue->getFamilyIndex()); - submitInfo = updateBufferRangeViaStagingBuffer(bufferRange,data,submissionQueue,submissionFence,submitInfo); - submitInfoPatcher.end(); - - assert(submitInfo.isValid()); - submissionQueue->submit(1u,&submitInfo,submissionFence); - return true; - } -#endif - //! WARNING: This function blocks the CPU and stalls the GPU! inline core::smart_refctd_ptr createFilledDeviceLocalBufferOnDedMem(const SIntendedSubmitInfo::SFrontHalf& submit, IGPUBuffer::SCreationParams&& params, const void* data) { @@ -629,42 +706,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return intendedNextSubmit; } - //! This function is an specialization of the `downloadBufferRangeViaStagingBufferAutoSubmit` function above. - //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit - //! Parameters: - //! - `submitInfo`: IQueue::SSubmitInfo used to submit the copy operations. - //! 
* Use this parameter to wait for previous operations to finish using submitInfo::waitSemaphores or signal new semaphores using submitInfo::signalSemaphores - //! * Fill submitInfo::commandBuffers with the commandbuffers you want to be submitted before the copy in this struct as well, for example pipeline barrier commands. - //! * Empty by default: waits for no semaphore and signals no semaphores. - //! Patches the submitInfo::commandBuffers - //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. - //! WARNING: If commandBufferCount > 0, The last commandBuffer won't be in the same state as it was before entering the function, because it needs to be `end()`ed and submitted - //! Valid Usage: - //! * If submitInfo::commandBufferCount > 0 and the last command buffer must be in one of these stages: `EXECUTABLE`, `INITIAL`, `RECORDING` - //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. 
- inline void downloadBufferRangeViaStagingBufferAutoSubmit( - const std::function& consumeCallback, const asset::SBufferRange& srcBufferRange, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo submitInfo = {} - ) - { - if (!submitInfo.isValid()) - { - // TODO: log error - assert(false); - return; - } - - CSubmitInfoPatcher submitInfoPatcher; - submitInfoPatcher.patchAndBegin(submitInfo, m_device, submissionQueue->getFamilyIndex()); - submitInfo = downloadBufferRangeViaStagingBuffer(consumeCallback, srcBufferRange, submissionQueue, submissionFence, submitInfo); - submitInfoPatcher.end(); - - assert(submitInfo.isValid()); - submissionQueue->submit(1u, &submitInfo, submissionFence); - } - //! This function is an specialization of the `downloadBufferRangeViaStagingBufferAutoSubmit` function above. //! Additionally waits for the fence //! WARNING: This function blocks CPU and stalls the GPU! @@ -686,13 +727,14 @@ class NBL_API2 IUtilities : public core::IReferenceCounted auto* fenceptr = fence.get(); m_device->blockForFences(1u, &fenceptr); - m_defaultDownloadBuffer->cull_frees(); + //! TODO: NOTE this method cannot be turned into a pure autoSubmitAndBlock + lambda because there's stuff to do AFTER the semaphore wait~! + m_defaultDownloadBuffer->cull_frees(); // its while(poll()) {} now IIRC } - +#endif // -------------- // buildAccelerationStructures // -------------- - +#if 0 // TODO: port later when we have an example //! WARNING: This function blocks the CPU and stalls the GPU! inline void buildAccelerationStructures(IQueue* queue, const core::SRange& pInfos, IGPUAccelerationStructure::BuildRangeInfo* const* ppBuildRangeInfos) { @@ -717,11 +759,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_device->blockForFences(1u,&fence.get()); } - +#endif // -------------- // updateImageViaStagingBuffer // -------------- - +#if 0 // TODO: port //! 
Copies `srcBuffer` to stagingBuffer and Records the commands needed to copy the image from stagingBuffer to `dstImage` //! If the allocation from staging memory fails due to large image size or fragmentation then This function may need to submit the command buffer via the `submissionQueue` and then signal the fence. //! Returns: @@ -765,33 +807,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted [[nodiscard("Use The New IQueue::SubmitInfo")]] IQueue::SSubmitInfo updateImageViaStagingBuffer( asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, IGPUImage::LAYOUT currentDstImageLayout, const core::SRange& regions, IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo intendedNextSubmit); - - //! This function is an specialization of the `updateImageViaStagingBuffer` function above. - //! Submission of the commandBuffer to submissionQueue happens automatically, no need for the user to handle submit - //! Parameters: - //! - `submitInfo`: IQueue::SSubmitInfo used to submit the copy operations. - //! * Use this parameter to wait for previous operations to finish using submitInfo::waitSemaphores or signal new semaphores using submitInfo::signalSemaphores - //! * Fill submitInfo::commandBuffers with the commandbuffers you want to be submitted before the copy in this struct as well, for example pipeline barrier commands. - //! * Empty by default: waits for no semaphore and signals no semaphores. - //! Patches the submitInfo::commandBuffers - //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission - //! 
If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. - //! WARNING: If commandBufferCount > 0, The last commandBuffer won't be in the same state as it was before entering the function, because it needs to be `end()`ed and submitted - //! Valid Usage: - //! * If submitInfo::commandBufferCount > 0 and the last command buffer must be in one of these stages: `EXECUTABLE`, `INITIAL`, `RECORDING` - //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. - void updateImageViaStagingBufferAutoSubmit( - asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, IGPUImage::LAYOUT currentDstImageLayout, const core::SRange& regions, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo submitInfo = {}); - - //! This function is an specialization of the `updateImageViaStagingBufferAutoSubmit` function above. - //! Additionally waits for the fence - //! WARNING: This function blocks CPU and stalls the GPU! - void updateImageViaStagingBufferAutoSubmit( - asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, IGPUImage::LAYOUT currentDstImageLayout, const core::SRange& regions, - IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo = {} - ); #endif protected: @@ -806,72 +821,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return range; } -#if 0 // TODO: port - //! Internal tool used to patch command buffers in submit info. - class CSubmitInfoPatcher - { - public: - //! Patches the submitInfo::commandBuffers and then makes sure the last command buffer is in recording state - //! If submitInfo::commandBufferCount == 0, it will create an implicit command buffer to use for recording copy commands - //! 
If submitInfo::commandBufferCount > 0 the last command buffer is in `EXECUTABLE` state, It will add another temporary command buffer to end of the array and use it for recording and submission - //! If submitInfo::commandBufferCount > 0 the last command buffer is in `RECORDING` or `INITIAL` state, It won't add another command buffer and uses the last command buffer for the copy commands. - //! Params: - //! - submitInfo: IQueue::SSubmitInfo to patch - //! - device: logical device to create new command pool and command buffer if necessary. - //! - newCommandPoolFamIdx: family index to create commandPool with if necessary. - inline void patchAndBegin(IQueue::SSubmitInfo& submitInfo, core::smart_refctd_ptr device, uint32_t newCommandPoolFamIdx) - { - bool needToCreateNewCommandBuffer = false; - if (submitInfo.commandBufferCount <= 0u) - needToCreateNewCommandBuffer = true; - else - { - auto lastCmdBuf = submitInfo.commandBuffers[submitInfo.commandBufferCount - 1u]; - if (lastCmdBuf->getState() == IGPUCommandBuffer::STATE::EXECUTABLE) - needToCreateNewCommandBuffer = true; - } - - // commandBuffer used to record the commands - if (needToCreateNewCommandBuffer) - { - core::smart_refctd_ptr pool = device->createCommandPool(newCommandPoolFamIdx, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - device->createCommandBuffers(pool.get(), IGPUCommandBuffer::LEVEL::PRIMARY, 1u, &m_newCommandBuffer); - - const uint32_t newCommandBufferCount = (needToCreateNewCommandBuffer) ? 
submitInfo.commandBufferCount + 1 : submitInfo.commandBufferCount; - m_allCommandBuffers.resize(newCommandBufferCount); - - for (uint32_t i = 0u; i < submitInfo.commandBufferCount; ++i) - m_allCommandBuffers[i] = submitInfo.commandBuffers[i]; - - m_recordCommandBuffer = m_newCommandBuffer.get(); - m_allCommandBuffers[newCommandBufferCount - 1u] = m_recordCommandBuffer; - - submitInfo.commandBufferCount = newCommandBufferCount; - submitInfo.commandBuffers = m_allCommandBuffers.data(); - - m_recordCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - } - else - { - m_recordCommandBuffer = submitInfo.commandBuffers[submitInfo.commandBufferCount - 1u]; - // If the last command buffer is in INITIAL state, bring it to RECORDING state - if (m_recordCommandBuffer->getState() == IGPUCommandBuffer::STATE::INITIAL) - m_recordCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - } - } - inline void end() - { - m_recordCommandBuffer->end(); - } - inline IGPUCommandBuffer* getRecordingCommandBuffer() { return m_recordCommandBuffer; } - - private: - IGPUCommandBuffer* m_recordCommandBuffer; - core::vector m_allCommandBuffers; - core::smart_refctd_ptr m_newCommandBuffer; // if necessary, then need to hold reference to. 
- }; -#endif core::smart_refctd_ptr m_device; core::smart_refctd_ptr > m_defaultDownloadBuffer; From 3d034c546fa5d44d43b2b31f319ae789320939b9 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 22:32:29 +0100 Subject: [PATCH 44/62] move the SIntendedSubmitInfo struct out of IUtilities --- include/nbl/video/utilities/IUtilities.h | 186 +---------------- .../nbl/video/utilities/SIntendedSubmitInfo.h | 195 ++++++++++++++++++ src/nbl/video/utilities/IUtilities.cpp | 42 +--- 3 files changed, 198 insertions(+), 225 deletions(-) create mode 100644 include/nbl/video/utilities/SIntendedSubmitInfo.h diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 55af9a3750..32baac4b95 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -7,11 +7,9 @@ #include "nbl/asset/asset.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" -#include "nbl/video/IGPUBuffer.h" -#include "nbl/video/IGPUImage.h" -#include "nbl/video/ILogicalDevice.h" #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/alloc/StreamingTransientDataBuffer.h" +#include "nbl/video/utilities/SIntendedSubmitInfo.h" #include "nbl/video/utilities/CPropertyPoolHandler.h" #include "nbl/video/utilities/CScanner.h" #include "nbl/video/utilities/CComputeBlit.h" @@ -194,188 +192,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted )); return allocationSize; } - - //! Struct meant to be used with any Utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. - //! Such functions are non-blocking (unless overflow) and take `SIntendedSubmitInfo` by reference and patch it accordingly. - //! MAKE SURE to do a submit to `queue` by yourself with a submit info obtained by casting `this` to `IQueue::SSubmitInfo` ! - //! for example: in the case the `frontHalf.waitSemaphores` were already waited upon, the struct will be modified to have it's `waitSemaphores` emptied. 
- struct SIntendedSubmitInfo final - { - public: - inline bool valid() const - { - if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) - return false; - const auto* scratch = frontHalf.getScratchCommandBuffer(); - // Must be resettable so we can end, submit, wait, reset and continue recording commands into it as-if nothing happened - if (!scratch->isResettable()) - return false; - // It makes no sense to reuse the same commands for a second submission. - // Moreover its dangerous because the utilities record their own internal commands which might use subresources for which - // frees have already been latched on the scratch semaphore you must signal anyway. - if (!scratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - return false; - if (scratch->getState()!=IGPUCommandBuffer::STATE::INITIAL) - return false; - return true; - } - - inline ISemaphore::SWaitInfo getScratchSemaphoreNextWait() const {return {signalSemaphores.front().semaphore,signalSemaphores.front().value};} - - inline operator IQueue::SSubmitInfo() const - { - return { - .waitSemaphores = frontHalf.waitSemaphores, - .commandBuffers = frontHalf.commandBuffers, - .signalSemaphores = signalSemaphores - }; - } - - // One thing you might notice is that this results in a few implicit Memory and Execution Dependencies - // So there's a little bit of non-deterministic behaviour we won't fight (will not insert a barrier every time you "could-have" overflown) - inline void overflowSubmit() - { - auto cmdbuf = frontHalf.getScratchCommandBuffer(); - auto& scratchSemaphore = signalSemaphores.front(); - // but first sumbit the already buffered up copies - cmdbuf->end(); - IQueue::SSubmitInfo submit = *this; - // we only signal the last semaphore which is used as scratch - submit.signalSemaphores = {&scratchSemaphore,1}; - assert(submit.isValid()); - frontHalf.queue->submit({&submit,1}); - // We wait (stall) on the immediately preceeding 
submission timeline semaphore signal value and increase it for the next signaller - { - const ISemaphore::SWaitInfo info = {scratchSemaphore.semaphore,scratchSemaphore.value++}; - const_cast(cmdbuf->getOriginDevice())->blockForSemaphores({&info,1}); - } - // we've already waited on the Host for the semaphores, no use waiting twice - frontHalf.waitSemaphores = {}; - // since all the commandbuffers have submitted already we only reuse the last one - frontHalf.commandBuffers = {&frontHalf.commandBuffers.back(),1}; - // we will still signal the same set in the future - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - } - - - //! The last CommandBuffer will be used to record the copy commands - struct SFrontHalf final - { - //! Need a valid queue and all the command buffers except the last one should be in `EXECUTABLE` state. - inline bool valid() const - { - if (!queue) - return false; - if (!commandBuffers.empty()) - for (size_t i=0; igetState()==IGPUCommandBuffer::STATE::EXECUTABLE) - return false; - return true; - } - - //! Little class to hold the storage for the modified commandbuffer span until submission time. 
- class CRAIISpanPatch final : core::Uncopyable - { - public: - inline ~CRAIISpanPatch() - { - toNullify->commandBuffers = {}; - } - inline CRAIISpanPatch(CRAIISpanPatch&& other) : CRAIISpanPatch() {operator=(std::move(other));} - inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) - { - commandBuffersStorage = std::move(rhs.commandBuffersStorage); - return *this; - } - - inline operator bool() const {return m_recordingCommandBuffer.get();} - - private: - friend SFrontHalf; - inline CRAIISpanPatch() = default; - inline CRAIISpanPatch(SFrontHalf* _toNull) : commandBuffersStorage(_toNull->commandBuffers.size()+1), toNullify(_toNull) {} - - core::vector commandBuffersStorage; - // If we made a new commandbuffer we need to nullify the span so it doesn't point at stale mem - SFrontHalf* toNullify = nullptr; - // If new one made, then need to hold reference to it, else its just an extra ref, but whatever - core::smart_refctd_ptr m_recordingCommandBuffer; - }; - //! Patches the `commandBuffers` and then makes sure the last command buffer is resettable, in recording state begun with ONE_TIME_SUBMIT - //! If we can't make the last cmdbuffer that way, we make a new one and add it onto the end (hence the name "patching") - //! If `commandBuffers.empty()`, it will create an implicit command buffer to use for recording commands, - //! else if the last command buffer is not feasible to use as scratch for whatever reason, - //! it will add another temporary command buffer to end of `commandBuffers` and use it for recording. - //! WARNING: If patching occurs: - //! - a submission must occur before the return value goes out of scope! - //! - if `!commandBuffers.empty()`, the last CommandBuffer won't be in the same state as it was before entering the function, - //! because it needs to be `end()`ed before the submission - //! - the destructor of the return value will clear `commandBuffers` span - //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. 
- [[nodiscard("The RAII object returned by `patch()` provides lifetimes to your spans!")]] - inline CRAIISpanPatch patch() - { - if (auto* candidateScratch = getScratchCommandBuffer(); candidateScratch && candidateScratch->isResettable()) - switch(candidateScratch->getState()) - { - case IGPUCommandBuffer::STATE::INITIAL: - if (!candidateScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - break; - [[fallthrough]]; - case IGPUCommandBuffer::STATE::RECORDING: - if (!candidateScratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - break; - { - CRAIISpanPatch retval; - retval.m_recordingCommandBuffer = core::smart_refctd_ptr(candidateScratch); - return retval; - } - break; - default: - break; - } - - CRAIISpanPatch retval(this); - std::copy(commandBuffers.begin(),commandBuffers.end(),retval.commandBuffersStorage.begin()); - { - auto pool = const_cast(queue->getOriginDevice())->createCommandPool(queue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&retval.m_recordingCommandBuffer,1})) - return {}; - if (!retval.m_recordingCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) - return {}; - retval.commandBuffersStorage.back().cmdbuf = retval.m_recordingCommandBuffer.get(); - } - commandBuffers = retval.commandBuffersStorage; - return retval; - } - - // Use the last command buffer in intendedNextSubmit, it should be in recording state - inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} - inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? 
nullptr:commandBuffers.back().cmdbuf;} - - // This parameter is required but may be unused if there is no need to submit - IQueue* queue = {}; - // Use this parameter to wait for previous operations to finish before whatever commands the Utility you're using records - std::span waitSemaphores = {}; - // Fill the commandbuffers you want to run before the first command the Utility records to run in the same submit, - // for example baked command buffers with pipeline barrier commands. - // Also remember that even though the last CommandBuffer is scratch, it you can record commands into it as well. - std::span commandBuffers = {}; - } frontHalf = {}; - //! The first Semaphore will be used as a scratch, so don't choose the values for waits and signals yourself as we can advance the counter an arbitrary amount! - //! You can actually examine the change in `signalSemaphore.front().value` to figure out how many overflows occurred. - //! This semaphore is needed to "stitch together" additional submits if they occur so they occur before and after the original intended waits and signals. - //! We use the first semaphore to keep the intended order of original semaphore signal and waits unchanged no matter how many overflows occur. - //! You do however, NEED TO KEEP IT in the signal set of the last submit you're supposed to do manually, this allows freeing any resources used - //! after the submit is done, indicating that your streaming routine is done. - //! * Also use this parameter to signal new semaphores so that other submits know your Utility method is done. - std::span signalSemaphores = {}; - - private: - friend class IUtilities; - static const char* ErrorText; - }; //! 
This method lets you wrap any other function following the "submit on overflow" pattern with the final submission diff --git a/include/nbl/video/utilities/SIntendedSubmitInfo.h b/include/nbl/video/utilities/SIntendedSubmitInfo.h new file mode 100644 index 0000000000..19a33d927b --- /dev/null +++ b/include/nbl/video/utilities/SIntendedSubmitInfo.h @@ -0,0 +1,195 @@ +#ifndef _NBL_VIDEO_S_INTENDED_SUBMIT_INFO_H_INCLUDED_ +#define _NBL_VIDEO_S_INTENDED_SUBMIT_INFO_H_INCLUDED_ + + +#include "nbl/video/IGPUCommandBuffer.h" + + +namespace nbl::video +{ + +//! Struct meant to be used with any Utility (not just `IUtilities`) which exhibits "submit on overflow" behaviour. +//! Such functions are non-blocking (unless overflow) and take `SIntendedSubmitInfo` by reference and patch it accordingly. +//! MAKE SURE to do a submit to `queue` by yourself with a submit info obtained by casting `this` to `IQueue::SSubmitInfo` ! +//! for example: in the case the `frontHalf.waitSemaphores` were already waited upon, the struct will be modified to have its `waitSemaphores` emptied. +struct SIntendedSubmitInfo final +{ + public: + inline bool valid() const + { + if (!frontHalf.valid() || frontHalf.commandBuffers.empty() || signalSemaphores.empty()) + return false; + const auto* scratch = frontHalf.getScratchCommandBuffer(); + // Must be resettable so we can end, submit, wait, reset and continue recording commands into it as-if nothing happened + if (!scratch->isResettable()) + return false; + // It makes no sense to reuse the same commands for a second submission. + // Moreover it's dangerous because the utilities record their own internal commands which might use subresources for which + // frees have already been latched on the scratch semaphore you must signal anyway. 
+ if (!scratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return false; + if (scratch->getState()!=IGPUCommandBuffer::STATE::INITIAL) + return false; + return true; + } + + inline ISemaphore::SWaitInfo getScratchSemaphoreNextWait() const {return {signalSemaphores.front().semaphore,signalSemaphores.front().value};} + + inline operator IQueue::SSubmitInfo() const + { + return { + .waitSemaphores = frontHalf.waitSemaphores, + .commandBuffers = frontHalf.commandBuffers, + .signalSemaphores = signalSemaphores + }; + } + + // One thing you might notice is that this results in a few implicit Memory and Execution Dependencies + // So there's a little bit of non-deterministic behaviour we won't fight (will not insert a barrier every time you "could-have" overflown) + inline void overflowSubmit() + { + auto cmdbuf = frontHalf.getScratchCommandBuffer(); + auto& scratchSemaphore = signalSemaphores.front(); + // but first submit the already buffered up copies + cmdbuf->end(); + IQueue::SSubmitInfo submit = *this; + // we only signal the last semaphore which is used as scratch + submit.signalSemaphores = {&scratchSemaphore,1}; + assert(submit.isValid()); + frontHalf.queue->submit({&submit,1}); + // We wait (stall) on the immediately preceding submission timeline semaphore signal value and increase it for the next signaller + { + const ISemaphore::SWaitInfo info = {scratchSemaphore.semaphore,scratchSemaphore.value++}; + const_cast(cmdbuf->getOriginDevice())->blockForSemaphores({&info,1}); + } + // we've already waited on the Host for the semaphores, no use waiting twice + frontHalf.waitSemaphores = {}; + // since all the commandbuffers have submitted already we only reuse the last one + frontHalf.commandBuffers = {&frontHalf.commandBuffers.back(),1}; + // we will still signal the same set in the future + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + } + 
+ + //! The last CommandBuffer will be used to record the copy commands + struct SFrontHalf final + { + //! Need a valid queue and all the command buffers except the last one should be in `EXECUTABLE` state. + inline bool valid() const + { + if (!queue) + return false; + if (!commandBuffers.empty()) + for (size_t i=0; igetState()==IGPUCommandBuffer::STATE::EXECUTABLE) + return false; + return true; + } + + //! Little class to hold the storage for the modified commandbuffer span until submission time. + class CRAIISpanPatch final : core::Uncopyable + { + public: + inline ~CRAIISpanPatch() + { + toNullify->commandBuffers = {}; + } + inline CRAIISpanPatch(CRAIISpanPatch&& other) : CRAIISpanPatch() {operator=(std::move(other));} + inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) + { + commandBuffersStorage = std::move(rhs.commandBuffersStorage); + return *this; + } + + inline operator bool() const {return m_recordingCommandBuffer.get();} + + private: + friend SFrontHalf; + inline CRAIISpanPatch() = default; + inline CRAIISpanPatch(SFrontHalf* _toNull) : commandBuffersStorage(_toNull->commandBuffers.size()+1), toNullify(_toNull) {} + + core::vector commandBuffersStorage; + // If we made a new commandbuffer we need to nullify the span so it doesn't point at stale mem + SFrontHalf* toNullify = nullptr; + // If new one made, then need to hold reference to it, else its just an extra ref, but whatever + core::smart_refctd_ptr m_recordingCommandBuffer; + }; + //! Patches the `commandBuffers` and then makes sure the last command buffer is resettable, in recording state begun with ONE_TIME_SUBMIT + //! If we can't make the last cmdbuffer that way, we make a new one and add it onto the end (hence the name "patching") + //! If `commandBuffers.empty()`, it will create an implicit command buffer to use for recording commands, + //! else if the last command buffer is not feasible to use as scratch for whatever reason, + //! 
it will add another temporary command buffer to end of `commandBuffers` and use it for recording. + //! WARNING: If patching occurs: + //! - a submission must occur before the return value goes out of scope! + //! - if `!commandBuffers.empty()`, the last CommandBuffer won't be in the same state as it was before entering the function, + //! because it needs to be `end()`ed before the submission + //! - the destructor of the return value will clear `commandBuffers` span + //! For more info on command buffer states See `ICommandBuffer::E_STATE` comments. + [[nodiscard("The RAII object returned by `patch()` provides lifetimes to your spans!")]] + inline CRAIISpanPatch patch() + { + if (auto* candidateScratch = getScratchCommandBuffer(); candidateScratch && candidateScratch->isResettable()) + switch(candidateScratch->getState()) + { + case IGPUCommandBuffer::STATE::INITIAL: + if (!candidateScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + break; + [[fallthrough]]; + case IGPUCommandBuffer::STATE::RECORDING: + if (!candidateScratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + break; + { + CRAIISpanPatch retval; + retval.m_recordingCommandBuffer = core::smart_refctd_ptr(candidateScratch); + return retval; + } + break; + default: + break; + } + + CRAIISpanPatch retval(this); + std::copy(commandBuffers.begin(),commandBuffers.end(),retval.commandBuffersStorage.begin()); + { + auto pool = const_cast(queue->getOriginDevice())->createCommandPool(queue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&retval.m_recordingCommandBuffer,1})) + return {}; + if (!retval.m_recordingCommandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) + return {}; + retval.commandBuffersStorage.back().cmdbuf = retval.m_recordingCommandBuffer.get(); + } + commandBuffers = retval.commandBuffersStorage; + return retval; + } + + // 
Use the last command buffer in intendedNextSubmit, it should be in recording state + inline IGPUCommandBuffer* getScratchCommandBuffer() {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + inline const IGPUCommandBuffer* getScratchCommandBuffer() const {return commandBuffers.empty() ? nullptr:commandBuffers.back().cmdbuf;} + + // This parameter is required but may be unused if there is no need to submit + IQueue* queue = {}; + // Use this parameter to wait for previous operations to finish before whatever commands the Utility you're using records + std::span waitSemaphores = {}; + // Fill the commandbuffers you want to run before the first command the Utility records to run in the same submit, + // for example baked command buffers with pipeline barrier commands. + // Also remember that even though the last CommandBuffer is scratch, you can record commands into it as well. + std::span commandBuffers = {}; + } frontHalf = {}; + //! The first Semaphore will be used as a scratch, so don't choose the values for waits and signals yourself as we can advance the counter an arbitrary amount! + //! You can actually examine the change in `signalSemaphores.front().value` to figure out how many overflows occurred. + //! This semaphore is needed to "stitch together" additional submits if they occur so they occur before and after the original intended waits and signals. + //! We use the first semaphore to keep the intended order of original semaphore signal and waits unchanged no matter how many overflows occur. + //! You do however, NEED TO KEEP IT in the signal set of the last submit you're supposed to do manually, this allows freeing any resources used + //! after the submit is done, indicating that your streaming routine is done. + //! * Also use this parameter to signal new semaphores so that other submits know your Utility method is done. 
+ std::span signalSemaphores = {}; + + private: + friend class IUtilities; + static const char* ErrorText; +}; + +} + +#endif \ No newline at end of file diff --git a/src/nbl/video/utilities/IUtilities.cpp b/src/nbl/video/utilities/IUtilities.cpp index 5ad7612f1e..397f7021a2 100644 --- a/src/nbl/video/utilities/IUtilities.cpp +++ b/src/nbl/video/utilities/IUtilities.cpp @@ -4,7 +4,7 @@ namespace nbl::video { -const char* IUtilities::SIntendedSubmitInfo::ErrorText = R"===(Invalid `IUtilities::SIntendedSubmitInfo`, possible reasons are: +const char* SIntendedSubmitInfo::ErrorText = R"===(Invalid `IUtilities::SIntendedSubmitInfo`, possible reasons are: - No `commandBuffers` or `signalSemaphores` given in respective spans - `commandBuffer.back()` is not Resettable - `commandBuffer.back()` is not already begun with ONE_TIME_SUBMIT_BIT @@ -168,44 +168,7 @@ IQueue::SSubmitInfo IUtilities::updateImageViaStagingBuffer( } return intendedNextSubmit; } - -void IUtilities::updateImageViaStagingBufferAutoSubmit( - asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, asset::IImage::LAYOUT currentDstImageLayout, const core::SRange& regions, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo submitInfo -) -{ - if(!submitInfo.isValid()) - { - m_logger.log("submitInfo is invalid.", nbl::system::ILogger::ELL_ERROR); - assert(false); - return; - } - - CSubmitInfoPatcher submitInfoPatcher; - submitInfoPatcher.patchAndBegin(submitInfo, m_device, submissionQueue->getFamilyIndex()); - submitInfo = updateImageViaStagingBuffer(srcBuffer,srcFormat,dstImage,currentDstImageLayout,regions,submissionQueue,submissionFence,submitInfo); - submitInfoPatcher.end(); - - assert(submitInfo.isValid()); - submissionQueue->submit(1u,&submitInfo,submissionFence); -} - -void IUtilities::updateImageViaStagingBufferAutoSubmit( - asset::ICPUBuffer const* srcBuffer, asset::E_FORMAT srcFormat, video::IGPUImage* dstImage, asset::IImage::LAYOUT 
currentDstImageLayout, const core::SRange& regions, - IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo -) -{ - if(!submitInfo.isValid()) - { - m_logger.log("submitInfo is invalid.", nbl::system::ILogger::ELL_ERROR); - assert(false); - return; - } - - auto fence = m_device->createFence(static_cast(0)); - updateImageViaStagingBufferAutoSubmit(srcBuffer,srcFormat,dstImage,currentDstImageLayout,regions,submissionQueue,fence.get(),submitInfo); - m_device->blockForFences(1u,&fence.get()); -} +#endif ImageRegionIterator::ImageRegionIterator( const core::SRange& copyRegions, @@ -762,6 +725,5 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo else return false; } -#endif } // namespace nbl::video \ No newline at end of file From 3160a464ab34b85714c356c4596007f9b40a6dc3 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 12 Jan 2024 23:07:19 +0100 Subject: [PATCH 45/62] going to sleep, next TODO is to implement the IUtilities::downloadBuffer methods --- include/nbl/video/utilities/SIntendedSubmitInfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/utilities/SIntendedSubmitInfo.h b/include/nbl/video/utilities/SIntendedSubmitInfo.h index 19a33d927b..d6f99edcd1 100644 --- a/include/nbl/video/utilities/SIntendedSubmitInfo.h +++ b/include/nbl/video/utilities/SIntendedSubmitInfo.h @@ -55,7 +55,7 @@ struct SIntendedSubmitInfo final IQueue::SSubmitInfo submit = *this; // we only signal the last semaphore which is used as scratch submit.signalSemaphores = {&scratchSemaphore,1}; - assert(submit.isValid()); + assert(submit.valid()); frontHalf.queue->submit({&submit,1}); // We wait (stall) on the immediately preceeding submission timeline semaphore signal value and increase it for the next signaller { From 8670d421dc1e90ef730532dad7f9a373c6a0725a Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 13 Jan 2024 22:02:14 +0100 Subject: [PATCH 46/62] outline the TODO for @theoreticalphysicsftw --- 
include/nbl/ext/ScreenShot/ScreenShot.h | 166 +----------------------- 1 file changed, 3 insertions(+), 163 deletions(-) diff --git a/include/nbl/ext/ScreenShot/ScreenShot.h b/include/nbl/ext/ScreenShot/ScreenShot.h index 536c060ddb..4e04cf1e84 100644 --- a/include/nbl/ext/ScreenShot/ScreenShot.h +++ b/include/nbl/ext/ScreenShot/ScreenShot.h @@ -15,9 +15,10 @@ using namespace nbl::video; Create a ScreenShot with gpu image usage and save it to a file. The queue being passed must have TRANSFER capability. - TODO: Add support for downloading a region of a specific subresource + TODO (Mihailo): Add support for downloading a region of a specific subresource */ +#if 0 // TODO (Mihailo): port inline core::smart_refctd_ptr createScreenShot( ILogicalDevice* logicalDevice, IQueue* queue, @@ -199,169 +200,8 @@ inline bool createScreenShot( IAssetWriter::SAssetWriteParams writeParams(cpuImageView.get()); return assetManager->writeAsset(filename.string(),writeParams); // TODO: Use std::filesystem::path } - -} // namespace nbl::ext::ScreenShot - #endif -#ifdef OLD_CODE // code from old `ditt` branch: - /* - Download mip level image with gpu image usage and save it to IGPUBuffer. - Because of the fence placed by driver the function stalls the CPU - to wait on the GPU to finish, beware of that. - @see IDriverFence - */ - - //! 
TODO: HANDLE UNPACK ALIGNMENT - [[nodiscard]] core::smart_refctd_ptr downloadImageMipLevel(IDriver* driver, IGPUImage* source, IGPUBuffer* destination, uint32_t sourceMipLevel = 0u, size_t destOffset = 0ull, bool implicitflush = true) - { - // will change this, https://github.com/buildaworldnet/IrrlichtBAW/issues/148 - if (isBlockCompressionFormat(source->getCreationParameters().format)) - return nullptr; - - auto extent = source->getMipSize(sourceMipLevel); - IGPUImage::SBufferCopy pRegions[1u] = { {destOffset,extent.x,extent.y,{static_cast(0u),sourceMipLevel,0u,1u},{0u,0u,0u},{extent.x,extent.y,extent.z}} }; - driver->copyImageToBuffer(source, destination, 1u, pRegions); - - return driver->placeFence(implicitflush); - } - - /* - Create a ScreenShot with gpu image usage and save it to a file. - */ - bool createScreenShot(IVideoDriver* driver, IAssetManager* assetManager, const IGPUImageView* gpuImageView, const std::string& outFileName, E_FORMAT convertToFormat=EF_UNKNOWN) - { - auto fetchedImageViewParmas = gpuImageView->getCreationParameters(); - auto gpuImage = fetchedImageViewParmas.image; - auto fetchedImageParams = gpuImage->getCreationParameters(); - auto image = ICPUImage::create(std::move(fetchedImageParams)); - - auto texelBufferRowLength = IImageAssetHandlerBase::calcPitchInBlocks(fetchedImageParams.extent.width * getBlockDimensions(fetchedImageParams.format).X, getTexelOrBlockBytesize(fetchedImageParams.format)); - - auto regions = core::make_refctd_dynamic_array>(1u); - ICPUImage::SBufferCopy& region = regions->front(); - - region.imageSubresource.mipLevel = 0u; - region.imageSubresource.baseArrayLayer = 0u; - region.imageSubresource.layerCount = 1u; - region.bufferOffset = 0u; - region.bufferRowLength = texelBufferRowLength; - region.bufferImageHeight = 0u; - region.imageOffset = { 0u, 0u, 0u }; - region.imageExtent = image->getCreationParameters().extent; - - IDeviceMemoryBacked::SDeviceMemoryRequirements memoryRequirements; - 
memoryRequirements.vulkanReqs.alignment = 64u; - memoryRequirements.vulkanReqs.memoryTypeBits = 0xffffffffu; - memoryRequirements.memoryHeapLocation = IDeviceMemoryAllocation::ESMT_NOT_DEVICE_LOCAL; - memoryRequirements.mappingCapability = IDeviceMemoryAllocation::EMCF_CAN_MAP_FOR_READ | IDeviceMemoryAllocation::EMCF_COHERENT | IDeviceMemoryAllocation::EMCF_CACHED; - memoryRequirements.vulkanReqs.size = image->getImageDataSizeInBytes(); - auto destinationBuffer = driver->createGPUBufferOnDedMem(memoryRequirements); - - auto mapPointerGetterFence = downloadImageMipLevel(driver, gpuImage.get(), destinationBuffer.get()); - - auto destinationBoundMemory = destinationBuffer->getBoundMemory(); - destinationBoundMemory->mapMemoryRange(IDeviceMemoryAllocation::EMCAF_READ, { 0u, memoryRequirements.vulkanReqs.size }); - - auto correctedScreenShotTexelBuffer = core::make_smart_refctd_ptr(memoryRequirements.vulkanReqs.size); - bool flipImage = true; - if(flipImage) - { - auto extent = gpuImage->getMipSize(0u); - uint32_t rowByteSize = extent.x * getTexelOrBlockBytesize(gpuImage->getCreationParameters().format); - for(uint32_t y = 0; y < extent.y; ++y) - { - uint32_t flipped_y = extent.y - y - 1; - memcpy(reinterpret_cast(correctedScreenShotTexelBuffer->getPointer()) + rowByteSize * y, reinterpret_cast(destinationBoundMemory->getMappedPointer()) + rowByteSize * flipped_y, rowByteSize); - } - } - else - { - memcpy(correctedScreenShotTexelBuffer->getPointer(), destinationBoundMemory->getMappedPointer(), memoryRequirements.vulkanReqs.size); - } - - destinationBoundMemory->unmapMemory(); - - image->setBufferAndRegions(std::move(correctedScreenShotTexelBuffer), regions); - - while (mapPointerGetterFence->waitCPU(1000ull, mapPointerGetterFence->canDeferredFlush()) == EDFR_TIMEOUT_EXPIRED) {} - - core::smart_refctd_ptr convertedImage; - if (convertToFormat != EF_UNKNOWN) - { - auto referenceImageParams = image->getCreationParameters(); - auto referenceBuffer = image->getBuffer(); - 
auto referenceRegions = image->getRegions(); - auto referenceRegion = referenceRegions.begin(); - const auto newTexelOrBlockByteSize = getTexelOrBlockBytesize(convertToFormat); - - auto newImageParams = referenceImageParams; - auto newCpuBuffer = core::make_smart_refctd_ptr(referenceBuffer->getSize() * newTexelOrBlockByteSize); - auto newRegions = core::make_refctd_dynamic_array>(referenceRegions.size()); - - for (auto newRegion = newRegions->begin(); newRegion != newRegions->end(); ++newRegion) - { - *newRegion = *(referenceRegion++); - newRegion->bufferOffset = newRegion->bufferOffset * newTexelOrBlockByteSize; - } - - newImageParams.format = convertToFormat; - convertedImage = ICPUImage::create(std::move(newImageParams)); - convertedImage->setBufferAndRegions(std::move(newCpuBuffer), newRegions); - - //CConvertFormatImageFilter TODO: use this one instead with a nice dither @Anastazluk, we could also get rid of a lot of code here, since there's a bunch of constraints - CSwizzleAndConvertImageFilter<> convertFilter; - CSwizzleAndConvertImageFilter<>::state_type state; - - state.swizzle = {}; - state.inImage = image.get(); - state.outImage = convertedImage.get(); - state.inOffset = { 0, 0, 0 }; - state.inBaseLayer = 0; - state.outOffset = { 0, 0, 0 }; - state.outBaseLayer = 0; - //state.dither = ; - - for (auto itr = 0; itr < convertedImage->getCreationParameters().mipLevels; ++itr) - { - auto regionWithMipMap = convertedImage->getRegions(itr).begin(); - - state.extent = regionWithMipMap->getExtent(); - state.layerCount = regionWithMipMap->imageSubresource.layerCount; - state.inMipLevel = regionWithMipMap->imageSubresource.mipLevel; - state.outMipLevel = regionWithMipMap->imageSubresource.mipLevel; - - const bool ok = convertFilter.execute(core::execution::par_unseq,&state); - assert(ok); - } - } - else - convertedImage = image; - auto newCreationParams = convertedImage->getCreationParameters(); - - ICPUImageView::SCreationParams viewParams; - viewParams.flags = 
static_cast(0u); - viewParams.image = convertedImage; - viewParams.format = newCreationParams.format; - viewParams.viewType = ICPUImageView::ET_2D; - viewParams.subresourceRange.baseArrayLayer = 0u; - viewParams.subresourceRange.layerCount = newCreationParams.arrayLayers; - viewParams.subresourceRange.baseMipLevel = 0u; - viewParams.subresourceRange.levelCount = newCreationParams.mipLevels; - - auto imageView = ICPUImageView::create(std::move(viewParams)); - - auto tryToWrite = [&](IAsset* asset) - { - IAssetWriter::SAssetWriteParams wparams(asset); - return assetManager->writeAsset(outFileName, wparams); - }; - - bool status = tryToWrite(convertedImage.get()); - if (!status) - status = tryToWrite(imageView.get()); - - return status; +} // namespace nbl::ext::ScreenShot - } #endif \ No newline at end of file From 2d86373fc40a9c6f7874b5664b7178f182c9b78c Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 14 Jan 2024 00:02:31 +0300 Subject: [PATCH 47/62] fix debugmessenger not being created --- src/nbl/video/CVulkanConnection.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanConnection.cpp b/src/nbl/video/CVulkanConnection.cpp index 3c24dea895..dfdcf0510e 100644 --- a/src/nbl/video/CVulkanConnection.cpp +++ b/src/nbl/video/CVulkanConnection.cpp @@ -234,7 +234,7 @@ core::smart_refctd_ptr CVulkanConnection::create(core::smart_ std::unique_ptr debugCallback = std::make_unique(std::move(logger)); VkDebugUtilsMessengerCreateInfoEXT debugMessengerCreateInfo = { VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, nullptr }; - if (logger && enabledFeatures.debugUtils) + if (enabledFeatures.debugUtils) { debugMessengerCreateInfo.flags = 0; auto debugCallbackFlags = getDebugCallbackFlagsFromLogLevelMask(logLevelMask); @@ -321,4 +321,4 @@ CVulkanConnection::~CVulkanConnection() vkDestroyInstance(m_vkInstance,nullptr); } -} \ No newline at end of file +} From ca2593ce2a7d9f7586b5e057173ebe7967df097f Mon Sep 17 00:00:00 2001 From: 
devsh Date: Sat, 13 Jan 2024 22:49:35 +0100 Subject: [PATCH 48/62] fix a validation error --- src/nbl/video/CVulkanPhysicalDevice.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 030890b187..ecfdaa6f42 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1410,10 +1410,11 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic // extensions - REQUIRE_EXTENSION_IF(enabledFeatures.swapchainMode.hasFlags(E_SWAPCHAIN_MODE::ESM_SURFACE),VK_KHR_SWAPCHAIN_EXTENSION_NAME,nullptr); + const bool swapchainEnabled = enabledFeatures.swapchainMode.hasFlags(E_SWAPCHAIN_MODE::ESM_SURFACE); + REQUIRE_EXTENSION_IF(swapchainEnabled,VK_KHR_SWAPCHAIN_EXTENSION_NAME,nullptr); { // If we reach here then the instance extension VK_KHR_Surface was definitely enabled otherwise the extension wouldn't be reported by physical device - REQUIRE_EXTENSION_IF(true,VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME); + REQUIRE_EXTENSION_IF(swapchainEnabled,VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME); // TODO: https://github.com/Devsh-Graphics-Programming/Nabla/issues/508 } From 461cb4af0b09cb44306fcd571d3d6fd14231e1ae Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 13 Jan 2024 23:19:58 +0100 Subject: [PATCH 49/62] rework pipeline barriers and events to use std::spans --- include/nbl/video/IGPUCommandBuffer.h | 13 ++-- src/nbl/video/CVulkanCommandBuffer.cpp | 89 +++++++++++++------------- src/nbl/video/CVulkanCommandBuffer.h | 2 +- src/nbl/video/IGPUCommandBuffer.cpp | 63 +++++++++--------- 4 files changed, 82 insertions(+), 85 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index fb50d2a7e5..1fd61a19c8 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -146,18 +146,15 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject 
using image_barrier_t = SImageMemoryBarrier; // no dependency flags because they must be 0 per the spec - uint32_t memBarrierCount = 0; - const asset::SMemoryBarrier* memBarriers = nullptr; - uint32_t bufBarrierCount = 0; - const buffer_barrier_t* bufBarriers = nullptr; - uint32_t imgBarrierCount = 0; - const image_barrier_t* imgBarriers = nullptr; + std::span memBarriers = {}; + std::span bufBarriers = {}; + std::span imgBarriers = {}; }; using SEventDependencyInfo = SDependencyInfo; bool setEvent(IEvent* const _event, const SEventDependencyInfo& depInfo); bool resetEvent(IEvent* _event, const core::bitflag stageMask); - bool waitEvents(const uint32_t eventCount, IEvent* const* const pEvents, const SEventDependencyInfo* depInfos); + bool waitEvents(const std::span events, const SEventDependencyInfo* depInfos); struct SOwnershipTransferBarrier { @@ -539,7 +536,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject virtual bool setEvent_impl(IEvent* const _event, const SEventDependencyInfo& depInfo) = 0; virtual bool resetEvent_impl(IEvent* const _event, const core::bitflag stageMask) = 0; - virtual bool waitEvents_impl(const uint32_t eventCount, IEvent* const* const pEvents, const SEventDependencyInfo* depInfos) = 0; + virtual bool waitEvents_impl(const std::span events, const SEventDependencyInfo* depInfos) = 0; virtual bool pipelineBarrier_impl(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) = 0; virtual bool fillBuffer_impl(const asset::SBufferRange& range, const uint32_t data) = 0; diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index 188ca33595..2b1f9d9070 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -98,57 +98,57 @@ static inline auto getVkImageSubresourceFrom(const SubresourceRange& range) -> s template VkDependencyInfoKHR fill( - VkMemoryBarrier2* memoryBarriers, VkBufferMemoryBarrier2* bufferBarriers, 
VkImageMemoryBarrier2* imageBarriers, + VkMemoryBarrier2* const memoryBarriers, VkBufferMemoryBarrier2* const bufferBarriers, VkImageMemoryBarrier2* const imageBarriers, const IGPUCommandBuffer::SDependencyInfo& depInfo, const uint32_t selfQueueFamilyIndex=IQueue::FamilyIgnored ) { VkDependencyInfoKHR info = { VK_STRUCTURE_TYPE_DEPENDENCY_INFO_KHR,nullptr }; - for (auto i=0; isType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2_KHR; + outMem->pNext = nullptr; + fill(*(outMem++),in,selfQueueFamilyIndex); } - for (auto i=0; igetCachedCreationParams().isConcurrentSharing()); - out.buffer = static_cast(in.range.buffer.get())->getInternalObject(); - out.offset = in.range.offset; - out.size = in.range.size; + outBuf->sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2_KHR; + outBuf->pNext = nullptr; // VkExternalMemoryAcquireUnmodifiedEXT + + fill(*outBuf,in.barrier,selfQueueFamilyIndex,in.range.buffer->getCachedCreationParams().isConcurrentSharing()); + outBuf->buffer = static_cast(in.range.buffer.get())->getInternalObject(); + outBuf->offset = in.range.offset; + outBuf->size = in.range.size; + outBuf++; } - for (auto i=0; igetCachedCreationParams().isConcurrentSharing()); - out.image = static_cast(in.image)->getInternalObject(); - out.subresourceRange = getVkImageSubresourceFrom(in.subresourceRange); + outImg->sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2_KHR; + outImg->pNext = nullptr; // VkExternalMemoryAcquireUnmodifiedEXT or VkSampleLocationsInfoEXT + + outImg->oldLayout = getVkImageLayoutFromImageLayout(in.oldLayout); + outImg->newLayout = getVkImageLayoutFromImageLayout(in.newLayout); + fill(*outImg,in.barrier,selfQueueFamilyIndex,in.image->getCachedCreationParams().isConcurrentSharing()); + outImg->image = static_cast(in.image)->getInternalObject(); + outImg->subresourceRange = getVkImageSubresourceFrom(in.subresourceRange); + outImg++; } info.dependencyFlags = 0u; - info.memoryBarrierCount = depInfo.memBarrierCount; + info.memoryBarrierCount = 
depInfo.memBarriers.size(); info.pMemoryBarriers = memoryBarriers; - info.bufferMemoryBarrierCount = depInfo.bufBarrierCount; + info.bufferMemoryBarrierCount = depInfo.bufBarriers.size(); info.pBufferMemoryBarriers = bufferBarriers; - info.imageMemoryBarrierCount = depInfo.imgBarrierCount; + info.imageMemoryBarrierCount = depInfo.imgBarriers.size(); info.pImageMemoryBarriers = imageBarriers; return info; } bool CVulkanCommandBuffer::setEvent_impl(IEvent* const _event, const SEventDependencyInfo& depInfo) { - IGPUCommandPool::StackAllocation memoryBarriers(m_cmdpool,depInfo.memBarrierCount); - IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,depInfo.bufBarrierCount); - IGPUCommandPool::StackAllocation imageBarriers(m_cmdpool,depInfo.imgBarrierCount); + IGPUCommandPool::StackAllocation memoryBarriers(m_cmdpool,depInfo.memBarriers.size()); + IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,depInfo.bufBarriers.size()); + IGPUCommandPool::StackAllocation imageBarriers(m_cmdpool,depInfo.imgBarriers.size()); if (!memoryBarriers || !bufferBarriers || !imageBarriers) return false; @@ -163,11 +163,12 @@ bool CVulkanCommandBuffer::resetEvent_impl(IEvent* const _event, const core::bit return true; } -bool CVulkanCommandBuffer::waitEvents_impl(const uint32_t eventCount, IEvent* const* const pEvents, const SEventDependencyInfo* depInfos) +bool CVulkanCommandBuffer::waitEvents_impl(const std::span events, const SEventDependencyInfo* depInfos) { - IGPUCommandPool::StackAllocation events(m_cmdpool,eventCount); + const uint32_t eventCount = events.size(); + IGPUCommandPool::StackAllocation vk_events(m_cmdpool,eventCount); IGPUCommandPool::StackAllocation infos(m_cmdpool,eventCount); - if (!events || !infos) + if (!vk_events || !infos) return false; uint32_t memBarrierCount = 0u; @@ -175,9 +176,9 @@ bool CVulkanCommandBuffer::waitEvents_impl(const uint32_t eventCount, IEvent* co uint32_t imgBarrierCount = 0u; for (auto i=0u; i 
memoryBarriers(m_cmdpool,memBarrierCount); IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,bufBarrierCount); @@ -190,21 +191,21 @@ bool CVulkanCommandBuffer::waitEvents_impl(const uint32_t eventCount, IEvent* co imgBarrierCount = 0u; for (auto i=0u; i(pEvents[i])->getInternalObject(); + vk_events[i] = static_cast(events[i])->getInternalObject(); infos[i] = fill(memoryBarriers.data()+memBarrierCount,bufferBarriers.data()+bufBarrierCount,imageBarriers.data()+imgBarrierCount,depInfos[i]); memBarrierCount += infos[i].memoryBarrierCount; bufBarrierCount += infos[i].bufferMemoryBarrierCount; imgBarrierCount += infos[i].imageMemoryBarrierCount; } - getFunctionTable().vkCmdWaitEvents2(m_cmdbuf,eventCount,events.data(),infos.data()); + getFunctionTable().vkCmdWaitEvents2(m_cmdbuf,eventCount,vk_events.data(),infos.data()); return true; } bool CVulkanCommandBuffer::pipelineBarrier_impl(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) { - IGPUCommandPool::StackAllocation memoryBarriers(m_cmdpool,depInfo.memBarrierCount); - IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,depInfo.bufBarrierCount); - IGPUCommandPool::StackAllocation imageBarriers(m_cmdpool,depInfo.imgBarrierCount); + IGPUCommandPool::StackAllocation memoryBarriers(m_cmdpool,depInfo.memBarriers.size()); + IGPUCommandPool::StackAllocation bufferBarriers(m_cmdpool,depInfo.bufBarriers.size()); + IGPUCommandPool::StackAllocation imageBarriers(m_cmdpool,depInfo.imgBarriers.size()); if (!memoryBarriers || !bufferBarriers || !imageBarriers) return false; diff --git a/src/nbl/video/CVulkanCommandBuffer.h b/src/nbl/video/CVulkanCommandBuffer.h index 3d18c7db0b..5e39300f48 100644 --- a/src/nbl/video/CVulkanCommandBuffer.h +++ b/src/nbl/video/CVulkanCommandBuffer.h @@ -48,7 +48,7 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer bool setEvent_impl(IEvent* const _event, const SEventDependencyInfo& depInfo) override; bool resetEvent_impl(IEvent* const _event, 
const core::bitflag stageMask) override; - bool waitEvents_impl(const uint32_t eventCount, IEvent* const* const pEvents, const SEventDependencyInfo* depInfos) override; + bool waitEvents_impl(const std::span events, const SEventDependencyInfo* depInfos) override; bool pipelineBarrier_impl(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) override; bool fillBuffer_impl(const asset::SBufferRange& range, const uint32_t data) override; diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 0f890ffdb8..6d2a3449c0 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -195,18 +195,18 @@ bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo // TODO: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdPipelineBarrier2-None-07891 // TODO: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdPipelineBarrier2-None-07892 // TODO: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkBufferMemoryBarrier2-srcStageMask-03851 - for (auto j=0u; jvalidateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),depInfo.memBarriers[j])) + for (const auto& barrier : depInfo.memBarriers) + if (!device->validateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),barrier)) return true; - for (auto j=0u; jvalidateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),depInfo.bufBarriers[j])) + if (invalidBufferRange(barrier.range,1u,IGPUBuffer::EUF_NONE)) + if (!device->validateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),barrier)) return true; } - for (auto j=0u; jvalidateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),depInfo.imgBarriers[j])) + for (const auto& barrier : depInfo.imgBarriers) + if (!device->validateMemoryBarrier(m_cmdpool->getQueueFamilyIndex(),barrier)) return true; #endif // _NBL_DEBUG return false; @@ -261,19 +261,19 @@ bool IGPUCommandBuffer::resetEvent(IEvent* _event, const 
core::bitflag events, const SEventDependencyInfo* depInfos) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT,RENDERPASS_SCOPE::BOTH)) return false; - if (eventCount==0u) + if (events.empty()) return false; uint32_t totalBufferCount = 0u; uint32_t totalImageCount = 0u; - for (auto i=0u; iisCompatibleDevicewise(pEvents[i])) + if (!events[i] || !this->isCompatibleDevicewise(events[i])) return false; const auto& depInfo = depInfos[i]; @@ -283,24 +283,24 @@ bool IGPUCommandBuffer::waitEvents(const uint32_t eventCount, IEvent* const* con if (invalidDependency(depInfo)) return false; - totalBufferCount += depInfo.bufBarrierCount; - totalImageCount += depInfo.imgBarrierCount; + totalBufferCount += depInfo.bufBarriers.size(); + totalImageCount += depInfo.imgBarriers.size(); } - auto* cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,eventCount,pEvents,totalBufferCount,totalImageCount); + auto* cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,events.size(),events.data(),totalBufferCount,totalImageCount); if (!cmd) return false; auto outIt = cmd->getDeviceMemoryBacked(); - for (auto i=0u; i(depInfo.imgBarriers[j].image); + for (const auto& barrier : depInfo.bufBarriers) + *(outIt++) = barrier.range.buffer; + for (const auto& barrier : depInfo.imgBarriers) + *(outIt++) = core::smart_refctd_ptr(barrier.image); } - return waitEvents_impl(eventCount,pEvents,depInfos); + return waitEvents_impl(events,depInfos); } bool IGPUCommandBuffer::pipelineBarrier(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) @@ -308,7 +308,7 @@ bool IGPUCommandBuffer::pipelineBarrier(const core::bitflag bool @@ -338,15 +338,14 @@ bool IGPUCommandBuffer::pipelineBarrier(const core::bitflagm_commandListPool.emplace(m_commandList,depInfo.bufBarrierCount,depInfo.imgBarrierCount); + auto* cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,depInfo.bufBarriers.size(),depInfo.imgBarriers.size()); if (!cmd) return false; 
auto outIt = cmd->getVariableCountResources(); - for (auto j=0u; j(depInfo.imgBarriers[j].image); + for (const auto& barrier : depInfo.bufBarriers) + *(outIt++) = barrier.range.buffer; + for (const auto& barrier : depInfo.imgBarriers) + *(outIt++) = core::smart_refctd_ptr(barrier.image); return pipelineBarrier_impl(dependencyFlags,depInfo); } From d96fd1d8db18b140b3afb8a9b7cf7e4ed1824a68 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 14 Jan 2024 00:16:54 +0100 Subject: [PATCH 50/62] Port `downloadBufferRangeViaStagingBuffer --- include/nbl/video/utilities/IUtilities.h | 105 +++++++---------------- 1 file changed, 32 insertions(+), 73 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 32baac4b95..ed5927d00d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -298,16 +298,15 @@ class NBL_API2 IUtilities : public core::IReferenceCounted uint32_t localOffset = StreamingTransientDataBufferMT<>::invalid_value; m_defaultUploadBuffer.get()->multi_allocate(std::chrono::steady_clock::now()+std::chrono::microseconds(500u),1u,&localOffset,&allocationSize,&m_allocationAlignment); // copy only the unpadded part - if (localOffset != StreamingTransientDataBufferMT<>::invalid_value) + if (localOffset!=StreamingTransientDataBufferMT<>::invalid_value) { const void* dataPtr = reinterpret_cast(data) + uploadedSize; memcpy(reinterpret_cast(m_defaultUploadBuffer->getBufferPointer()) + localOffset, dataPtr, subSize); } - // keep trying again - if (localOffset == StreamingTransientDataBufferMT<>::invalid_value) + else { nextSubmit.overflowSubmit(); - continue; + continue; // keep trying again } // some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly if (m_defaultUploadBuffer.get()->needsManualFlushOrInvalidate()) @@ -402,9 +401,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted StreamingTransientDataBufferMT<>* 
m_downstreamingBuffer; const size_t m_dstOffset; }; -#if 0 // TODO: port + //! Calls the callback to copy the data to a destination Offset - //! * IMPORTANT: To make the copies ready, IUtility::getDefaultDownStreamingBuffer()->cull_frees() should be called after the `submissionFence` is signaled. + //! * IMPORTANT: To make all the callbacks execute, IUtility::getDefaultDownStreamingBuffer()->cull_frees() should be called after the `nextSubmit.signalSemaphores.front()` is signaled. //! If the allocation from staging memory fails due to large image size or fragmentation then This function may need to submit the command buffer via the `submissionQueue` and then signal the fence. //! Returns: //! IQueue::SSubmitInfo to use for command buffer submission instead of `intendedNextSubmit`. @@ -438,50 +437,44 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! * submissionQueue must point to a valid IQueue //! * submissionFence must point to a valid IGPUFence //! * submissionFence must be in `UNSIGNALED` state - [[nodiscard("Use The New IQueue::SubmitInfo")]] inline IQueue::SSubmitInfo downloadBufferRangeViaStagingBuffer( - const std::function& consumeCallback, const asset::SBufferRange& srcBufferRange, - IQueue* submissionQueue, IGPUFence* submissionFence, IQueue::SSubmitInfo intendedNextSubmit = {} - ) + inline bool downloadBufferRangeViaStagingBuffer(const std::function& consumeCallback, SIntendedSubmitInfo& nextSubmit, const asset::SBufferRange& srcBufferRange) { - if (!intendedNextSubmit.isValid() || intendedNextSubmit.commandBufferCount <= 0u) + if (!srcBufferRange.isValid() || !srcBufferRange.buffer->getCreationParams().usage.hasFlags(asset::IBuffer::EUF_TRANSFER_SRC_BIT)) { - // TODO: log error -> intendedNextSubmit is invalid - assert(false); - return intendedNextSubmit; + m_logger.log("Invalid `srcBufferRange` or buffer has no `EUF_TRANSFER_SRC_BIT` usage flag, cannot `downloadBufferRangeViaStagingBuffer`!",system::ILogger::ELL_ERROR); + return false; 
} - // Use the last command buffer in intendedNextSubmit, it should be in recording state - auto& cmdbuf = intendedNextSubmit.commandBuffers[intendedNextSubmit.commandBufferCount - 1]; - - assert(cmdbuf->getState() == IGPUCommandBuffer::STATE::RECORDING && cmdbuf->isResettable()); - assert(cmdbuf->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)); + if (!nextSubmit.valid()) + { + m_logger.log(nextSubmit.ErrorText, system::ILogger::ELL_ERROR); + return false; + } const auto& limits = m_device->getPhysicalDevice()->getLimits(); const uint32_t optimalTransferAtom = limits.maxResidentInvocations*sizeof(uint32_t); - auto* cmdpool = cmdbuf->getPool(); - assert(cmdpool->getQueueFamilyIndex() == submissionQueue->getFamilyIndex()); - + auto cmdbuf = nextSubmit.frontHalf.getScratchCommandBuffer(); // Basically downloadedSize is downloadRecordedIntoCommandBufferSize :D - for (size_t downloadedSize = 0ull; downloadedSize < srcBufferRange.size;) + for (size_t downloadedSize=0ull; downloadedSizemax_size(); + const uint32_t maxFreeBlock = m_defaultDownloadBuffer->max_size(); // get allocation size - const uint32_t allocationSize = getAllocationSizeForStreamingBuffer(notDownloadedSize, m_allocationAlignment, maxFreeBlock, optimalTransferAtom); - const uint32_t copySize = core::min(allocationSize, notDownloadedSize); + const uint32_t allocationSize = getAllocationSizeForStreamingBuffer(notDownloadedSize,m_allocationAlignment,maxFreeBlock,optimalTransferAtom); + const uint32_t copySize = core::min(allocationSize,notDownloadedSize); uint32_t localOffset = StreamingTransientDataBufferMT<>::invalid_value; m_defaultDownloadBuffer.get()->multi_allocate(std::chrono::steady_clock::now()+std::chrono::microseconds(500u),1u,&localOffset,&allocationSize,&m_allocationAlignment); - if (localOffset != StreamingTransientDataBufferMT<>::invalid_value) + if (localOffset!=StreamingTransientDataBufferMT<>::invalid_value) { IGPUCommandBuffer::SBufferCopy copy; copy.srcOffset 
= srcBufferRange.offset + downloadedSize; copy.dstOffset = localOffset; copy.size = copySize; - cmdbuf->copyBuffer(srcBufferRange.buffer.get(), m_defaultDownloadBuffer.get()->getBuffer(), 1u, ©); + cmdbuf->copyBuffer(srcBufferRange.buffer.get(),m_defaultDownloadBuffer->getBuffer(),1u,©); auto dataConsumer = core::make_smart_refctd_ptr( IDeviceMemoryAllocation::MemoryRange(localOffset,copySize), @@ -490,63 +483,29 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_defaultDownloadBuffer.get(), downloadedSize ); - m_defaultDownloadBuffer.get()->multi_deallocate(1u, &localOffset, &allocationSize, core::smart_refctd_ptr(submissionFence), &dataConsumer.get()); + m_defaultDownloadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getScratchSemaphoreNextWait(),&dataConsumer.get()); downloadedSize += copySize; } - else - { - // but first sumbit the already buffered up copies - cmdbuf->end(); - IQueue::SSubmitInfo submit = intendedNextSubmit; - submit.signalSemaphoreCount = 0u; - submit.pSignalSemaphores = nullptr; - assert(submit.isValid()); - submissionQueue->submit(1u, &submit, submissionFence); - m_device->blockForFences(1u, &submissionFence); - - intendedNextSubmit.commandBufferCount = 1u; - intendedNextSubmit.commandBuffers = &cmdbuf; - intendedNextSubmit.waitSemaphoreCount = 0u; - intendedNextSubmit.pWaitSemaphores = nullptr; - intendedNextSubmit.pWaitDstStageMask = nullptr; - - // before resetting we need poll all events in the allocator's deferred free list - m_defaultDownloadBuffer->cull_frees(); - // we can reset the fence and commandbuffer because we fully wait for the GPU to finish here - m_device->resetFences(1u, &submissionFence); - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - } + else // but first sumbit the already buffered up copies + nextSubmit.overflowSubmit(); } - return intendedNextSubmit; + return true; } //! 
This function is an specialization of the `downloadBufferRangeViaStagingBufferAutoSubmit` function above. //! Additionally waits for the fence //! WARNING: This function blocks CPU and stalls the GPU! - inline void downloadBufferRangeViaStagingBufferAutoSubmit( - const asset::SBufferRange& srcBufferRange, void* data, - IQueue* submissionQueue, const IQueue::SSubmitInfo& submitInfo = {} - ) + inline bool downloadBufferRangeViaStagingBufferAutoSubmit(const SIntendedSubmitInfo::SFrontHalf& submit,const asset::SBufferRange& srcBufferRange, void* data) { - if (!submitInfo.isValid()) - { - // TODO: log error - assert(false); - return; - } - - - auto fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); - downloadBufferRangeViaStagingBufferAutoSubmit(std::function(default_data_consumption_callback_t(data)), srcBufferRange, submissionQueue, fence.get(), submitInfo); - auto* fenceptr = fence.get(); - m_device->blockForFences(1u, &fenceptr); + if (!autoSubmitAndBlock(submit,[&](SIntendedSubmitInfo& nextSubmit){return downloadBufferRangeViaStagingBuffer(default_data_consumption_callback_t(data),nextSubmit,srcBufferRange);})) + return false; - //! TODO: NOTE this method cannot be turned into a pure autoSubmitAndBlock + lambda because there's stuff to do AFTER the semaphore wait~! - m_defaultDownloadBuffer->cull_frees(); // its while(poll()) {} now IIRC + //! NOTE this method cannot be turned into a pure autoSubmitAndBlock + lambda because there's stuff to do AFTER the semaphore wait~! 
+ m_defaultDownloadBuffer->cull_frees(); + return true; } -#endif + // -------------- // buildAccelerationStructures // -------------- From 2d2acc9e7affb5676e1e7b2635bfe044d10a3a24 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 14 Jan 2024 00:54:23 +0100 Subject: [PATCH 51/62] fix bug in CRAIISpanPatch --- include/nbl/video/utilities/IUtilities.h | 2 +- include/nbl/video/utilities/SIntendedSubmitInfo.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index ed5927d00d..492a1db027 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -496,7 +496,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! This function is an specialization of the `downloadBufferRangeViaStagingBufferAutoSubmit` function above. //! Additionally waits for the fence //! WARNING: This function blocks CPU and stalls the GPU! - inline bool downloadBufferRangeViaStagingBufferAutoSubmit(const SIntendedSubmitInfo::SFrontHalf& submit,const asset::SBufferRange& srcBufferRange, void* data) + inline bool downloadBufferRangeViaStagingBufferAutoSubmit(const SIntendedSubmitInfo::SFrontHalf& submit, const asset::SBufferRange& srcBufferRange, void* data) { if (!autoSubmitAndBlock(submit,[&](SIntendedSubmitInfo& nextSubmit){return downloadBufferRangeViaStagingBuffer(default_data_consumption_callback_t(data),nextSubmit,srcBufferRange);})) return false; diff --git a/include/nbl/video/utilities/SIntendedSubmitInfo.h b/include/nbl/video/utilities/SIntendedSubmitInfo.h index d6f99edcd1..654d73e324 100644 --- a/include/nbl/video/utilities/SIntendedSubmitInfo.h +++ b/include/nbl/video/utilities/SIntendedSubmitInfo.h @@ -99,6 +99,8 @@ struct SIntendedSubmitInfo final inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) { commandBuffersStorage = std::move(rhs.commandBuffersStorage); + std::swap(toNullify,rhs.toNullify); + 
std::swap(m_recordingCommandBuffer,rhs.m_recordingCommandBuffer); return *this; } From 60c1c3916b6e5f209b7597dc7995e3628b9933f1 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 14 Jan 2024 01:08:03 +0100 Subject: [PATCH 52/62] Ported Example 23, and fixed a few bugs here and there --- include/nbl/video/IGPUCommandBuffer.h | 2 +- include/nbl/video/utilities/SIntendedSubmitInfo.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index 1fd61a19c8..f9fd1b5225 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -667,7 +667,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject BOTH = OUTSIDE|INSIDE }; using queue_flags_t = IQueue::FAMILY_FLAGS; - bool checkStateBeforeRecording(const core::bitflag allowedQueueFlags=queue_flags_t::NONE, const core::bitflag renderpassScope=RENDERPASS_SCOPE::BOTH); + bool checkStateBeforeRecording(const core::bitflag allowedQueueFlags=~queue_flags_t::NONE, const core::bitflag renderpassScope=RENDERPASS_SCOPE::BOTH); template bool invalidDependency(const SDependencyInfo& depInfo) const; diff --git a/include/nbl/video/utilities/SIntendedSubmitInfo.h b/include/nbl/video/utilities/SIntendedSubmitInfo.h index 654d73e324..5ad8369943 100644 --- a/include/nbl/video/utilities/SIntendedSubmitInfo.h +++ b/include/nbl/video/utilities/SIntendedSubmitInfo.h @@ -28,7 +28,7 @@ struct SIntendedSubmitInfo final // frees have already been latched on the scratch semaphore you must signal anyway. 
if (!scratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT)) return false; - if (scratch->getState()!=IGPUCommandBuffer::STATE::INITIAL) + if (scratch->getState()!=IGPUCommandBuffer::STATE::RECORDING) return false; return true; } @@ -93,7 +93,8 @@ struct SIntendedSubmitInfo final public: inline ~CRAIISpanPatch() { - toNullify->commandBuffers = {}; + if (toNullify) + toNullify->commandBuffers = {}; } inline CRAIISpanPatch(CRAIISpanPatch&& other) : CRAIISpanPatch() {operator=(std::move(other));} inline CRAIISpanPatch& operator=(CRAIISpanPatch&& rhs) From 3faf1fb4d3d5322fef0b5e9643513956f4f623ee Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 13 Jan 2024 22:33:00 +0300 Subject: [PATCH 53/62] merge conflicts --- include/nbl/video/CCUDASharedMemory.h | 4 ---- include/nbl/video/utilities/IUtilities.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 9b3e4a0551..1ae9f32ff6 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -20,10 +20,6 @@ namespace nbl::video { -class CCUDAMemoryMapping: public core::IReferenceCounted -{ -}; - class CCUDASharedMemory : public core::IReferenceCounted { public: diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 492a1db027..983c2ab277 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -234,7 +234,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! WARNING: This function blocks CPU and stalls the GPU! 
inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) { - auto semaphore = m_device->createSemaphore(0); + auto semaphore = m_device->createSemaphore(ISemaphore::SCreationParams{.initialValue=0}); // so we begin latching everything on the value of 1, but if we overflow it increases IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; From fd4f7333bf12332ebc5f76fd7f0feb9f01df0d73 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 14 Jan 2024 03:26:13 +0300 Subject: [PATCH 54/62] add missing external resource property queries --- include/nbl/asset/IBuffer.h | 2 + include/nbl/video/IDeviceMemoryAllocation.h | 24 ---- include/nbl/video/IPhysicalDevice.h | 139 ++++++++++++++++++++ src/nbl/video/CVulkanLogicalDevice.cpp | 35 ++++- src/nbl/video/CVulkanPhysicalDevice.h | 69 ++++++++++ 5 files changed, 242 insertions(+), 27 deletions(-) diff --git a/include/nbl/asset/IBuffer.h b/include/nbl/asset/IBuffer.h index e11d8faf7d..d50a415e69 100644 --- a/include/nbl/asset/IBuffer.h +++ b/include/nbl/asset/IBuffer.h @@ -42,6 +42,8 @@ class IBuffer : public core::IBuffer, public IDescriptor //! synthetic Nabla inventions // whether `IGPUCommandBuffer::updateBuffer` can be used on this buffer EUF_INLINE_UPDATE_VIA_CMDBUF = 0x80000000u, + + EUF_SYNTHEHIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if anymore synthethic flags are added*/ }; //! 
diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 64529858ec..28ad0dcfa3 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -83,30 +83,6 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, }; - /* ExternalMemoryProperties *//* provided by VK_KHR_external_memory_capabilities */ - struct SExternalMemoryProperties - { - uint32_t exportableTypes : 7 = ~0u; - uint32_t compatibleTypes : 7 = ~0u; - uint32_t dedicatedOnly : 1 = 0u; - uint32_t exportable : 1 = ~0u; - uint32_t importable : 1 = ~0u; - - bool operator == (SExternalMemoryProperties const& rhs) const = default; - - SExternalMemoryProperties operator &(SExternalMemoryProperties rhs) const - { - rhs.exportableTypes &= exportableTypes; - rhs.compatibleTypes &= compatibleTypes; - rhs.dedicatedOnly |= dedicatedOnly; - rhs.exportable &= exportable; - rhs.importable &= importable; - return rhs; - } - }; - - static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); - // const ILogicalDevice* getOriginDevice() const {return m_originDevice;} diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 583c8ac9d0..30459e1667 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -26,8 +26,54 @@ namespace nbl::video { + + + class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable { + template static constexpr bool is_bitflag = false; + template static constexpr bool is_bitflag> = true; + + template struct RequestMapTraits; + templatestruct RequestMapTraits : RequestMapTraits {}; + template struct RequestMapTraits + { + using Key = std::tuple...>; + struct Hasher + { + template + static size_t hash(size_t seed, Key const& key) + { + if constexpr (0 == N) + return seed; + else + { + using cur = std::remove_cvref_t(key))>; + + if constexpr 
(is_bitflag) + core::hash_combine(seed, cur::UNDERLYING_TYPE(std::get(key).value)); + else if constexpr (std::is_convertible_v) + core::hash_combine(seed, size_t(std::get(key))); + else + core::hash_combine(seed, std::get(key)); + + return hash(seed, key); + } + + } + + size_t operator()(Key const& key) const + { + return hash(0, key); + } + }; + + using Map = std::unordered_map; + }; + + template + using RequestMap = typename RequestMapTraits::Map; + public: // virtual E_API_TYPE getAPIType() const = 0; @@ -242,6 +288,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable !! Same goes for `vkGetPhysicalDeviceSparseImageFormatProperties2` */ + struct SFormatBufferUsages { struct SUsage @@ -687,6 +734,81 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return createLogicalDevice_impl(std::move(params)); } + + /* ExternalMemoryProperties *//* provided by VK_KHR_external_memory_capabilities */ + struct SExternalMemoryProperties + { + uint32_t exportableTypes : 7 = ~0u; + uint32_t compatibleTypes : 7 = ~0u; + uint32_t dedicatedOnly : 1 = 0u; + uint32_t exportable : 1 = ~0u; + uint32_t importable : 1 = ~0u; + + bool operator == (SExternalMemoryProperties const& rhs) const = default; + + SExternalMemoryProperties operator &(SExternalMemoryProperties rhs) const + { + rhs.exportableTypes &= exportableTypes; + rhs.compatibleTypes &= compatibleTypes; + rhs.dedicatedOnly |= dedicatedOnly; + rhs.exportable &= exportable; + rhs.importable &= importable; + return rhs; + } + }; + + static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); + + struct SImageFormatProperties + { + VkExtent3D maxExtent = {}; + uint32_t maxMipLevels = {}; + uint32_t maxArrayLayers = {}; + IGPUImage::E_SAMPLE_COUNT_FLAGS sampleCounts = IGPUImage::ESCF_1_BIT; + uint64_t maxResourceSize = 0; + + bool operator == (SImageFormatProperties const& rhs) const = default; + }; + + struct SExternalImageFormatProperties : 
SImageFormatProperties, SExternalMemoryProperties + { + }; + + SExternalMemoryProperties getExternalBufferProperties( + core::bitflag usage, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + usage &= ~asset::IBuffer::EUF_SYNTHEHIC_FLAGS_MASK; // mask out synthetic flags + { + std::shared_lock lock(m_externalBufferPropertiesMutex); + auto it = m_externalBufferProperties.find({ usage, handleType }); + if (it != m_externalBufferProperties.end()) + return it->second; + } + + std::unique_lock lock(m_externalBufferPropertiesMutex); + return m_externalBufferProperties[{ usage, handleType }] = getExternalBufferProperties_impl(usage, handleType); + } + + SExternalImageFormatProperties getExternalImageProperties( + asset::E_FORMAT format, + IGPUImage::TILING tiling, + core::bitflag usage, + core::bitflag flags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + auto key = std::tuple{ format, tiling, usage, flags, handleType }; + { + std::shared_lock lock(m_externalImagePropertiesMutex); + auto it = m_externalImageProperties.find(key); + if (it != m_externalImageProperties.end()) + return it->second; + } + + std::unique_lock lock(m_externalImagePropertiesMutex); + return m_externalImageProperties[key] = getExternalImageProperties_impl(format, tiling, usage, flags, handleType); + } + protected: struct SInitData final { @@ -745,6 +867,23 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return 220u; // largest from above } + // external memory + /* ExternalBufferProperties *//* provided by VK_KHR_external_memory_capabilities */ + + + virtual SExternalMemoryProperties getExternalBufferProperties_impl(core::bitflag usage, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + mutable RequestMap m_externalBufferProperties; + mutable std::shared_mutex m_externalBufferPropertiesMutex; + + virtual SExternalImageFormatProperties getExternalImageProperties_impl( + asset::E_FORMAT format, + 
IGPUImage::TILING tiling, + core::bitflag usage, + core::bitflag flags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + mutable RequestMap m_externalImageProperties; + mutable std::shared_mutex m_externalImagePropertiesMutex; + // Format Promotion struct SBufferFormatPromotionRequestHash { diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 844bfc54cb..39b61d7c53 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -337,15 +337,44 @@ bool CVulkanLogicalDevice::bindImageMemory_impl(const uint32_t count, const SBin core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) { VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; - // VkBufferDeviceAddressCreateInfoEXT, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR - vk_createInfo.pNext = nullptr; + // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkBufferDeviceAddressCreateInfoEXT, VkBufferOpaqueCaptureAddressCreateInfo, VkDedicatedAllocationBufferCreateInfoNV, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR + + VkExternalMemoryBufferCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO, + .handleTypes = creationParams.externalHandleTypes.value, + }; + + const bool external = creationParams.externalHandleTypes.value; + + vk_createInfo.pNext = external ? &externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); vk_createInfo.usage = getVkBufferUsageFlagsFromBufferUsageFlags(creationParams.usage); - vk_createInfo.sharingMode = creationParams.isConcurrentSharing() ? 
VK_SHARING_MODE_CONCURRENT:VK_SHARING_MODE_EXCLUSIVE; + vk_createInfo.sharingMode = creationParams.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = creationParams.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = creationParams.queueFamilyIndices; + bool dedicatedOnly = false; + + if (external) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(creationParams.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + // TODO: Handle this + dedicatedOnly = props.dedicatedOnly; + } + } + VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) return nullptr; diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index c1552c88f1..56069a3dd4 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -109,6 +109,75 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice // [NOOP] If sparseImageFloat32AtomicMinMax is enabled, shaderImageFloat32AtomicMinMax must be enabled } + inline static SExternalMemoryProperties mapExternalMemoryProps(VkExternalMemoryProperties const& props) + { + return { + .exportableTypes = props.exportFromImportedHandleTypes, + .compatibleTypes = props.compatibleHandleTypes, + .dedicatedOnly = props.externalMemoryFeatures & VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT ? 1u : 0u, + .exportable = props.externalMemoryFeatures & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT ? 
1u : 0u, + .importable = props.externalMemoryFeatures & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT ? 1u : 0u, + }; + } + + SExternalMemoryProperties getExternalBufferProperties_impl(core::bitflag usage, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override + { + assert(!(handleType & (handleType - 1))); + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO, + .usage = static_cast(usage.value), + .handleType = static_cast(handleType) + }; + VkExternalBufferProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES }; + vkGetPhysicalDeviceExternalBufferProperties(m_vkPhysicalDevice, &info, &externalProps); + return mapExternalMemoryProps(externalProps.externalMemoryProperties); + } + + SExternalImageFormatProperties getExternalImageProperties_impl( + asset::E_FORMAT format, + IGPUImage::TILING tiling, + core::bitflag usage, + core::bitflag flags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override + { + assert(!(handleType & (handleType - 1))); + + VkPhysicalDeviceExternalImageFormatInfo extInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO, + .handleType = static_cast(handleType), + }; + + VkPhysicalDeviceImageFormatInfo2 info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = &extInfo, + .format = static_cast(format), + .tiling = static_cast(tiling), + .usage = usage.value, + .flags = flags.value, + }; + + VkExternalImageFormatProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES }; + + VkImageFormatProperties2 props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = &externalProps, + }; + + vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &info, &props); + + return + { + { + .maxExtent = props.imageFormatProperties.maxExtent, + .maxMipLevels = props.imageFormatProperties.maxMipLevels, + .maxArrayLayers = 
props.imageFormatProperties.maxArrayLayers, + .sampleCounts = static_cast(props.imageFormatProperties.sampleCounts), + .maxResourceSize = props.imageFormatProperties.maxResourceSize, + }, + mapExternalMemoryProps(externalProps.externalMemoryProperties) + }; + } + core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) override; private: From 5b1940cc22f1affe6a7deed2022d0a51966952a8 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 14 Jan 2024 04:40:31 +0300 Subject: [PATCH 55/62] add more stuff --- include/nbl/video/CCUDASharedMemory.h | 2 +- include/nbl/video/CVulkanDeviceMemoryBacked.h | 6 +- include/nbl/video/IPhysicalDevice.h | 6 +- src/nbl/video/CCUDASharedMemory.cpp | 8 +-- src/nbl/video/CVulkanBuffer.h | 2 +- src/nbl/video/CVulkanDeviceMemoryBacked.cpp | 6 +- src/nbl/video/CVulkanLogicalDevice.cpp | 60 +++++++++++++++---- src/nbl/video/CVulkanPhysicalDevice.h | 8 ++- 8 files changed, 68 insertions(+), 30 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 1ae9f32ff6..117a1ff4b5 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -49,7 +49,7 @@ class CCUDASharedMemory : public core::IReferenceCounted core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - core::smart_refctd_ptr exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; + core::smart_refctd_ptr createAndBindImage(ILogicalDevice* device, IGPUImage::SCreationParams&& params) const; protected: diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index c996000e04..2505de6865 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -35,11 +35,11 @@ class CVulkanDeviceMemoryBacked : public Interface protected: // special constructor for when memory requirements are known 
up-front (so far only swapchains and internal forwarding here) CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const IDeviceMemoryBacked::SDeviceMemoryRequirements& _memReqs, const VkResource_t vkHandle); - CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const VkResource_t vkHandle) : - CVulkanDeviceMemoryBacked(dev,std::move(_creationParams),obtainRequirements(dev,vkHandle),vkHandle) {} + CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, bool dedicatedOnly, const VkResource_t vkHandle) : + CVulkanDeviceMemoryBacked(dev,std::move(_creationParams), obtainRequirements(dev, dedicatedOnly, vkHandle),vkHandle) {} private: - static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle); + static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle); core::smart_refctd_ptr m_memory = nullptr; size_t m_offset = 0u; diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 30459e1667..e32a65d9f3 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -793,11 +793,12 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable SExternalImageFormatProperties getExternalImageProperties( asset::E_FORMAT format, IGPUImage::TILING tiling, + IGPUImage::E_TYPE type, core::bitflag usage, core::bitflag flags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const { - auto key = std::tuple{ format, tiling, usage, flags, handleType }; + auto key = std::tuple{ format, tiling, type, usage, flags, handleType }; { std::shared_lock lock(m_externalImagePropertiesMutex); auto it = m_externalImageProperties.find(key); @@ -806,7 +807,7 @@ class NBL_API2 
IPhysicalDevice : public core::Interface, public core::Unmovable } std::unique_lock lock(m_externalImagePropertiesMutex); - return m_externalImageProperties[key] = getExternalImageProperties_impl(format, tiling, usage, flags, handleType); + return m_externalImageProperties[key] = getExternalImageProperties_impl(format, tiling, type, usage, flags, handleType); } protected: @@ -878,6 +879,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable virtual SExternalImageFormatProperties getExternalImageProperties_impl( asset::E_FORMAT format, IGPUImage::TILING tiling, + IGPUImage::E_TYPE type, core::bitflag usage, core::bitflag flags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 3ebb8e211d..82d6f496cd 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -77,16 +77,12 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDev #endif -core::smart_refctd_ptr CCUDASharedMemory::exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const +core::smart_refctd_ptr CCUDASharedMemory::createAndBindImage(ILogicalDevice* device, IGPUImage::SCreationParams&& params) const { if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) return nullptr; - auto img = device->createImage({ - std::move(params), {{ .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE }}, - IGPUImage::TILING::LINEAR, - 1 /*preinitialized*/, - }); + auto img = device->createImage(std::move(params)); if (exportAsMemory(device, img.get())) return img; diff --git a/src/nbl/video/CVulkanBuffer.h b/src/nbl/video/CVulkanBuffer.h index 4596981c2a..988d50c2ec 100644 --- a/src/nbl/video/CVulkanBuffer.h +++ b/src/nbl/video/CVulkanBuffer.h @@ -16,7 +16,7 @@ class CVulkanBuffer : public CVulkanDeviceMemoryBacked using base_t = CVulkanDeviceMemoryBacked; public: - inline 
CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, const VkBuffer buffer) : base_t(dev,std::move(creationParams),buffer) {} + inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly, const VkBuffer buffer) : base_t(dev,std::move(creationParams), dedicatedOnly, buffer) {} void setObjectDebugName(const char* label) const override; diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 2bec9e9d06..8f08f9aa67 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -6,7 +6,7 @@ namespace nbl::video { template -IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle) +IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle) { const std::conditional_t vk_memoryRequirementsInfo = { IsImage ? 
VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2:VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,nullptr,vkHandle @@ -24,8 +24,8 @@ IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked CVulkanLogicalDevice::createBuffer_impl(IGPUB VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(creationParams),vk_buffer); + return core::make_smart_refctd_ptr(this,std::move(creationParams), dedicatedOnly, vk_buffer); } core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) @@ -399,17 +399,24 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_im core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params) { - VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, nullptr }; - vk_stencilUsage.stencilUsage = getVkImageUsageFlagsFromImageUsageFlags(params.actualStencilUsage().value,true); + VkExternalMemoryImageCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, + .handleTypes = params.externalHandleTypes.value, + }; + + const bool external = params.externalHandleTypes.value; + + VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, &externalMemoryInfo }; + vk_stencilUsage.stencilUsage = getVkImageUsageFlagsFromImageUsageFlags(params.actualStencilUsage().value, true); - std::array vk_formatList; + std::array vk_formatList; VkImageFormatListCreateInfo vk_formatListStruct = { VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO, &vk_stencilUsage }; vk_formatListStruct.viewFormatCount = 0u; // if only there existed a nice iterator that would let me iterate over set bits 64 faster if (params.viewFormats.any()) - for (auto fmt=0; fmt(fmt)); + for (auto fmt = 0; fmt < 
vk_formatList.size(); fmt++) + if (params.viewFormats.test(fmt)) + vk_formatList[vk_formatListStruct.viewFormatCount++] = getVkFormatFromFormat(static_cast(fmt)); vk_formatListStruct.pViewFormats = vk_formatList.data(); VkImageCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, &vk_formatListStruct }; @@ -421,16 +428,45 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_createInfo.arrayLayers = params.arrayLayers; vk_createInfo.samples = static_cast(params.samples); vk_createInfo.tiling = static_cast(params.tiling); - vk_createInfo.usage = getVkImageUsageFlagsFromImageUsageFlags(params.usage.value,asset::isDepthOrStencilFormat(params.format)); - vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT:VK_SHARING_MODE_EXCLUSIVE; + vk_createInfo.usage = getVkImageUsageFlagsFromImageUsageFlags(params.usage.value, asset::isDepthOrStencilFormat(params.format)); + vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = params.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = params.queueFamilyIndices; - vk_createInfo.initialLayout = params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED:VK_IMAGE_LAYOUT_UNDEFINED; + vk_createInfo.initialLayout = params.preinitialized ? 
VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED; + + bool dedicatedOnly = false; + if (external) + { + core::bitflag requestedTypes = params.externalHandleTypes; + auto pd = dynamic_cast(m_physicalDevice)->getInternalObject(); + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalImageProperties(params.format, params.tiling, params.type, params.usage, params.flags, handleType); + + if (props.maxArrayLayers < vk_createInfo.arrayLayers || + !core::bitflag(props.sampleCounts).hasFlags(params.samples) || + /* props.maxResourceSize?? */ + props.maxExtent.width < vk_createInfo.extent.width || + props.maxExtent.height < vk_createInfo.extent.height || + props.maxExtent.depth < vk_createInfo.extent.depth) + { + return nullptr; + } + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(params.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + dedicatedOnly |= props.dedicatedOnly; + } + } VkImage vk_image; - if (m_devf.vk.vkCreateImage(m_vkdev,&vk_createInfo,nullptr,&vk_image)!=VK_SUCCESS) + if (m_devf.vk.vkCreateImage(m_vkdev, &vk_createInfo, nullptr, &vk_image) != VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(params),vk_image); + return core::make_smart_refctd_ptr(this, std::move(params), dedicatedOnly, vk_image); } core::smart_refctd_ptr CVulkanLogicalDevice::createImageView_impl(IGPUImageView::SCreationParams&& params) diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index 56069a3dd4..9cfebccd3f 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -136,6 +136,7 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice SExternalImageFormatProperties getExternalImageProperties_impl( asset::E_FORMAT format, IGPUImage::TILING tiling, 
+ IGPUImage::E_TYPE type, core::bitflag usage, core::bitflag flags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override @@ -150,7 +151,8 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice VkPhysicalDeviceImageFormatInfo2 info = { .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, .pNext = &extInfo, - .format = static_cast(format), + .format = getVkFormatFromFormat(format), + .type = static_cast(type), .tiling = static_cast(tiling), .usage = usage.value, .flags = flags.value, @@ -163,7 +165,9 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice .pNext = &externalProps, }; - vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &info, &props); + VkResult re = vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &info, &props); + if(VK_SUCCESS != re) + return {}; return { From 3d9a5309206fe219180d3610d3fb82ac93c4458c Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 19:11:02 +0300 Subject: [PATCH 56/62] address pr comments --- include/nbl/video/CCUDADevice.h | 19 ++++++------- include/nbl/video/CCUDASharedMemory.h | 5 ++-- include/nbl/video/IDeviceMemoryAllocation.h | 27 +++++++++++------- include/nbl/video/IDeviceMemoryAllocator.h | 16 +++-------- include/nbl/video/IDeviceMemoryBacked.h | 2 +- include/nbl/video/ILogicalDevice.h | 31 ++++++--------------- include/nbl/video/ISemaphore.h | 4 +-- include/nbl/video/SPhysicalDeviceLimits.h | 4 --- include/nbl/video/utilities/IUtilities.h | 2 +- 9 files changed, 44 insertions(+), 66 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 7b2b952548..551c2a7e5b 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -37,6 +37,13 @@ class CCUDADevice : public core::IReferenceCounted static constexpr IDeviceMemoryBacked::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryBacked::EHT_OPAQUE_FD; static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = 
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif + struct SCUDACleaner : video::ICleanup + { + core::smart_refctd_ptr resource; + SCUDACleaner(core::smart_refctd_ptr resource) + : resource(std::move(resource)) + { } + }; enum E_VIRTUAL_ARCHITECTURE { @@ -95,18 +102,10 @@ class CCUDADevice : public core::IReferenceCounted protected: CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + + // CUDAHandler creates CUDADevice, it needs to access ctor friend class CCUDAHandler; - friend class CCUDASharedMemory; - friend class CCUDASharedSemaphore; - struct SCUDACleaner : video::ICleanup - { - core::smart_refctd_ptr resource; - SCUDACleaner(core::smart_refctd_ptr resource) - : resource(std::move(resource)) - { } - }; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _handle, core::smart_refctd_ptr&& _handler); ~CCUDADevice(); diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 117a1ff4b5..d900087d06 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -23,6 +23,7 @@ namespace nbl::video class CCUDASharedMemory : public core::IReferenceCounted { public: + // required for us to see the move ctor friend class CCUDADevice; CUdeviceptr getDeviceptr() const { return m_params.ptr; } @@ -49,11 +50,11 @@ class CCUDASharedMemory : public core::IReferenceCounted core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - core::smart_refctd_ptr createAndBindImage(ILogicalDevice* device, IGPUImage::SCreationParams&& params) const; + core::smart_refctd_ptr createAndBindImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; protected: - CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) + 
CCUDASharedMemory(core::smart_refctd_ptr&& device, SCachedCreationParams&& params) : m_device(std::move(device)) , m_params(std::move(params)) {} diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 28ad0dcfa3..9ca663b9ea 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -164,14 +164,21 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Constant variant of getMappedPointer inline const void* getMappedPointer() const { return m_mappedPtr; } - struct SCreationParams + struct SInfo + { + uint64_t allocationSize = 0; + core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE; + // Handle Type for external resources + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE + //! Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE + void* externalHandle = nullptr; + }; + + struct SCreationParams: SInfo { - core::bitflag allocateFlags = E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE; core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; - E_EXTERNAL_HANDLE_TYPE externalHandleType = E_EXTERNAL_HANDLE_TYPE::EHT_NONE; - void* externalHandle = nullptr; const bool dedicated = false; - const size_t allocationSize; }; protected: @@ -183,10 +190,10 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted IDeviceMemoryAllocation( const ILogicalDevice* originDevice, SCreationParams&& params = {}) : m_originDevice(originDevice) + , m_params(std::move(params)) , m_mappedPtr(nullptr) , m_mappedRange{ 0, 0 } , m_currentMappingAccess(EMCAF_NO_MAPPING_ACCESS) - , m_params(std::move(params)) {} virtual void* map_impl(const MemoryRange& range, const core::bitflag accessHint) = 0; @@ -194,10 +201,10 @@ class IDeviceMemoryAllocation : public 
virtual core::IReferenceCounted const ILogicalDevice* m_originDevice = nullptr; - uint8_t* m_mappedPtr; - MemoryRange m_mappedRange; - core::bitflag m_currentMappingAccess; - SCreationParams m_params; + SCreationParams m_params = {}; + uint8_t* m_mappedPtr = nullptr; + MemoryRange m_mappedRange = {}; + core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; std::unique_ptr m_postDestroyCleanup = nullptr; }; diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 408efd6da4..22ea3c8238 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -12,19 +12,11 @@ namespace nbl::video class IDeviceMemoryAllocator { public: - struct SAllocateInfo + struct SAllocateInfo: IDeviceMemoryAllocation::SInfo { - size_t size : 54 = 0ull; - size_t flags : 5 = 0u; // IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS - size_t memoryTypeIndex : 5 = 0u; + uint32_t memoryTypeIndex = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. - - // Handle Type for external resources - IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; - //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE - //! Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE - void* externalHandle = nullptr; }; //! 
IMemoryTypeIterator extracts memoryType indices from memoryTypeBits in arbitrary order @@ -54,8 +46,8 @@ class IDeviceMemoryAllocator inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) { SAllocateInfo ret = {}; - ret.size = m_reqs.size; - ret.flags = m_allocateFlags; + ret.allocationSize = m_reqs.size; + ret.allocateFlags = core::bitflag(m_allocateFlags); ret.memoryTypeIndex = dereference(); ret.dedication = dedication; ret.externalHandleType = m_handleType; diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index 278e681a35..0071a53d71 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -126,7 +126,7 @@ class IDeviceMemoryBacked : public IBackendObject //! members SCachedCreationParams m_cachedCreationParams; - SDeviceMemoryRequirements m_cachedMemoryReqs; + const SDeviceMemoryRequirements m_cachedMemoryReqs; void* m_cachedExternalHandle = nullptr; }; diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 49585f3413..a102005371 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -147,7 +147,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual IQueue::RESULT waitIdle() const = 0; //! Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) = 0; + virtual core::smart_refctd_ptr createSemaphore(uint64_t initialValue = 0, ISemaphore::SCreationParams&& = {}) = 0; virtual ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) = 0; // Forever waiting variant if you're confident that the fence will eventually be signalled inline ISemaphore::WAIT_RESULT blockForSemaphores(const std::span infos, const bool waitAll=true) @@ -285,29 +285,14 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe //! 
Descriptor Creation // Buffer (@see ICPUBuffer) - inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams) - { - const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; - if (creationParams.size>maxSize) - { - m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit!",system::ILogger::ELL_ERROR,creationParams.size,this,maxSize); - return nullptr; - } - return createBuffer_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); + // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); + // Creates an Image (@see ICPUImage) - inline core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams) - { - if (!IGPUImage::validateCreationParameters(creationParams)) - { - m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); - return nullptr; - } - // TODO: @Cyprian validation of creationParams against the device's limits (sample counts, etc.) 
see vkCreateImage - return createImage_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& params); + // Create an ImageView that can actually be used by shaders (@see ICPUImageView) inline core::smart_refctd_ptr createImageView(IGPUImageView::SCreationParams&& params) { @@ -765,9 +750,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual bool bindBufferMemory_impl(const uint32_t count, const SBindBufferMemoryInfo* pInfos) = 0; virtual bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) = 0; - virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) = 0; + virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) = 0; - virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) = 0; + virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 0b14590e83..5434591fb6 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -69,8 +69,6 @@ class ISemaphore : public IBackendObject //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE //! 
Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE void* externalHandle = nullptr; - - uint64_t initialValue = 0; }; auto const& getCreationParams() const @@ -85,7 +83,7 @@ class ISemaphore : public IBackendObject {} virtual ~ISemaphore() = default; - SCreationParams m_creationParams; + const SCreationParams m_creationParams; }; } diff --git a/include/nbl/video/SPhysicalDeviceLimits.h b/include/nbl/video/SPhysicalDeviceLimits.h index c3e13f145b..b639f37230 100644 --- a/include/nbl/video/SPhysicalDeviceLimits.h +++ b/include/nbl/video/SPhysicalDeviceLimits.h @@ -552,10 +552,6 @@ struct SPhysicalDeviceLimits /* CooperativeMatrixPropertiesKHR *//* VK_KHR_cooperative_matrix */ core::bitflag cooperativeMatrixSupportedStages = asset::IShader::ESS_UNKNOWN; - bool externalFenceWin32 = false; /* VK_KHR_external_fence_win32 */ // [TODO] requires instance extensions, add them - bool externalMemoryWin32 = false; /* VK_KHR_external_memory_win32 */ // [TODO] requires instance extensions, add them - bool externalSemaphoreWin32 = false; /* VK_KHR_external_semaphore_win32 */ // [TODO] requires instance extensions, add them - /* Always enabled if available, reported as limits */ // Core 1.0 Features diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 983c2ab277..d91fe09107 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -234,7 +234,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted //! WARNING: This function blocks CPU and stalls the GPU! 
inline bool autoSubmitAndBlock(const SIntendedSubmitInfo::SFrontHalf& submit, const std::function& what) { - auto semaphore = m_device->createSemaphore(ISemaphore::SCreationParams{.initialValue=0}); + auto semaphore = m_device->createSemaphore(); // so we begin latching everything on the value of 1, but if we overflow it increases IQueue::SSubmitInfo::SSemaphoreInfo info = {semaphore.get(),1}; From 4d174e530335fe3d7ac3a1e9c66dca3d466fcea5 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 20:10:19 +0300 Subject: [PATCH 57/62] last commit part 2 --- src/nbl/video/CCUDADevice.cpp | 2 +- src/nbl/video/CCUDASharedMemory.cpp | 46 ++--------- src/nbl/video/CVulkanImage.cpp | 1 - src/nbl/video/CVulkanLogicalDevice.cpp | 100 +++++++----------------- src/nbl/video/CVulkanLogicalDevice.h | 6 +- src/nbl/video/CVulkanPhysicalDevice.cpp | 4 +- src/nbl/video/IGPUCommandBuffer.cpp | 2 +- src/nbl/video/ILogicalDevice.cpp | 70 +++++++++++++++++ 8 files changed, 110 insertions(+), 121 deletions(-) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 39faaaa0ed..9fbb635f52 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -139,7 +139,7 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { IDeviceMemoryAllocator::SAllocateInfo info = { - .size = m_params.granularSize, - .externalHandleType = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - .externalHandle = m_params.osHandle, + { + .allocationSize = m_params.granularSize, + .externalHandleType = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + .externalHandle = m_params.osHandle, + } }; auto pd = device->getPhysicalDevice(); @@ -43,46 +45,12 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsMemor std::make_unique(core::smart_refctd_ptr(this))).memory; } -#if 0 -core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDevice* device, core::bitflag usage) 
const +core::smart_refctd_ptr CCUDASharedMemory::createAndBindImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const { if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) return nullptr; - auto buf = device->createBuffer({{ - .size = m_params.granularSize, - .usage = usage }, {{ - .postDestroyCleanup = std::make_unique(core::smart_refctd_ptr(this)), - .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - .externalHandle = m_params.osHandle - }}}); - - auto req = buf->getMemoryReqs(); - auto pd = device->getPhysicalDevice(); - switch (m_params.location) - { - case CU_MEM_LOCATION_TYPE_DEVICE: req.memoryTypeBits &= pd->getDeviceLocalMemoryTypeBits(); break; - case CU_MEM_LOCATION_TYPE_HOST: req.memoryTypeBits &= pd->getHostVisibleMemoryTypeBits(); break; - // TODO(Atil): Figure out how to handle these - case CU_MEM_LOCATION_TYPE_HOST_NUMA: - case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: - default: break; - } - - if (!device->allocate(req, buf.get()).isValid()) - return nullptr; - - return buf; -} - -#endif - -core::smart_refctd_ptr CCUDASharedMemory::createAndBindImage(ILogicalDevice* device, IGPUImage::SCreationParams&& params) const -{ - if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) - return nullptr; - - auto img = device->createImage(std::move(params)); + auto img = device->createImage({ std::move(params), { {.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE } }, IGPUImage::TILING::LINEAR }); if (exportAsMemory(device, img.get())) return img; diff --git a/src/nbl/video/CVulkanImage.cpp b/src/nbl/video/CVulkanImage.cpp index 748f21720b..ff5e2dfcb9 100644 --- a/src/nbl/video/CVulkanImage.cpp +++ b/src/nbl/video/CVulkanImage.cpp @@ -9,7 +9,6 @@ namespace nbl::video CVulkanImage::~CVulkanImage() { preDestroyStep(); - // don't destroy imported handles if (!m_cachedCreationParams.skipHandleDestroy) { const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); 
diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index a5b885849b..afedf60786 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -45,7 +45,7 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaphore::SCreationParams&& params) +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(uint64_t initialValue, ISemaphore::SCreationParams&& params) { VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; VkExportSemaphoreWin32HandleInfoKHR handleInfo = { .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, .dwAccess = GENERIC_ALL }; @@ -54,7 +54,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaph VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; type.pNext = params.externalHandleTypes.value ? &exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; - type.initialValue = params.initialValue; + type.initialValue = initialValue; VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, &type }; createInfo.flags = static_cast(0); // flags must be 0 @@ -150,16 +150,15 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) return ret; - const core::bitflag allocateFlags(info.flags); VkMemoryAllocateFlagsInfo vk_allocateFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, nullptr }; { - if (allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) + if 
(info.allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) vk_allocateFlagsInfo.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT; vk_allocateFlagsInfo.deviceMask = 0u; // unused: for now } VkMemoryDedicatedAllocateInfo vk_dedicatedInfo = {VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, nullptr}; VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo }; - vk_allocateInfo.allocationSize = info.size; + vk_allocateInfo.allocationSize = info.allocationSize; vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; VkImportMemoryWin32HandleInfoKHR importInfo = { @@ -168,13 +167,26 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca .handle = info.externalHandle }; + VkExportMemoryWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL, + }; + + VkExportMemoryAllocateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, + .pNext = &exportInfo, + .handleTypes = static_cast(info.externalHandleType), + }; + const void** pNext = &vk_allocateFlagsInfo.pNext; if (info.externalHandleType) { - // Importing - *pNext = &importInfo; - pNext = &importInfo.pNext; + if (info.externalHandle) //importing + *pNext = &importInfo; + else // exporting + *pNext = &exportInfo; + pNext = (const void**)&((VkBaseInStructure*)*pNext)->pNext; } if(info.dedication) @@ -207,15 +219,8 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; - CVulkanMemoryAllocation::SCreationParams params = { - .allocateFlags = allocateFlags, - .memoryPropertyFlags = memoryPropertyFlags, - .externalHandleType = info.externalHandleType, - .externalHandle = info.externalHandle, - .dedicated = 
!!info.dedication, - .allocationSize = info.size, - }; - + CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; + ret.memory = core::make_smart_refctd_ptr(this,vk_deviceMemory, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) @@ -334,7 +339,7 @@ bool CVulkanLogicalDevice::bindImageMemory_impl(const uint32_t count, const SBin } -core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) +core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) { VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkBufferDeviceAddressCreateInfoEXT, VkBufferOpaqueCaptureAddressCreateInfo, VkDedicatedAllocationBufferCreateInfoNV, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR @@ -344,9 +349,8 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB .handleTypes = creationParams.externalHandleTypes.value, }; - const bool external = creationParams.externalHandleTypes.value; - vk_createInfo.pNext = external ? &externalMemoryInfo : nullptr; + vk_createInfo.pNext = creationParams.externalHandleTypes.value ? 
&externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); vk_createInfo.usage = getVkBufferUsageFlagsFromBufferUsageFlags(creationParams.usage); @@ -354,26 +358,6 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB vk_createInfo.queueFamilyIndexCount = creationParams.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = creationParams.queueFamilyIndices; - bool dedicatedOnly = false; - - if (external) - { - core::bitflag requestedTypes = creationParams.externalHandleTypes; - - while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) - { - const auto handleType = static_cast(1u << (idx - 1)); - requestedTypes ^= handleType; - - auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); - - if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(creationParams.externalHandleTypes)) // incompatibility between requested types - return nullptr; - - // TODO: Handle this - dedicatedOnly = props.dedicatedOnly; - } - } VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) @@ -397,15 +381,13 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_im return nullptr; } -core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params) +core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) { VkExternalMemoryImageCreateInfo externalMemoryInfo = { .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, .handleTypes = params.externalHandleTypes.value, }; - - const bool external = params.externalHandleTypes.value; - + VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, &externalMemoryInfo }; vk_stencilUsage.stencilUsage = 
getVkImageUsageFlagsFromImageUsageFlags(params.actualStencilUsage().value, true); @@ -434,35 +416,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_createInfo.pQueueFamilyIndices = params.queueFamilyIndices; vk_createInfo.initialLayout = params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED; - bool dedicatedOnly = false; - if (external) - { - core::bitflag requestedTypes = params.externalHandleTypes; - auto pd = dynamic_cast(m_physicalDevice)->getInternalObject(); - while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) - { - const auto handleType = static_cast(1u << (idx - 1)); - requestedTypes ^= handleType; - - auto props = m_physicalDevice->getExternalImageProperties(params.format, params.tiling, params.type, params.usage, params.flags, handleType); - - if (props.maxArrayLayers < vk_createInfo.arrayLayers || - !core::bitflag(props.sampleCounts).hasFlags(params.samples) || - /* props.maxResourceSize?? */ - props.maxExtent.width < vk_createInfo.extent.width || - props.maxExtent.height < vk_createInfo.extent.height || - props.maxExtent.depth < vk_createInfo.extent.depth) - { - return nullptr; - } - - if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(params.externalHandleTypes)) // incompatibility between requested types - return nullptr; - - dedicatedOnly |= props.dedicatedOnly; - } - } - + VkImage vk_image; if (m_devf.vk.vkCreateImage(m_vkdev, &vk_createInfo, nullptr, &vk_image) != VK_SUCCESS) return nullptr; diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 0df38ffd67..f18fb3dad4 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -52,7 +52,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice return CVulkanQueue::getResultFrom(m_devf.vk.vkDeviceWaitIdle(m_vkdev)); } - core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&&) override; + core::smart_refctd_ptr 
createSemaphore(uint64_t initialValue, ISemaphore::SCreationParams &&) override; ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; @@ -103,9 +103,9 @@ class CVulkanLogicalDevice final : public ILogicalDevice bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) override; // descriptor creation - core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) override; + core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) override; core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) override; - core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) override; + core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) override; core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) override; VkAccelerationStructureKHR createAccelerationStructure(const IGPUAccelerationStructure::SCreationParams& params, const VkAccelerationStructureTypeKHR type, const VkAccelerationStructureMotionInfoNV* motionInfo=nullptr); inline core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) override diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index ecfdaa6f42..62dcde7d42 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1204,9 +1204,7 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart if (isExtensionSupported(VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME)) properties.limits.cooperativeMatrixRobustness = cooperativeMatrixFeatures.robustness; #endif - properties.limits.externalFenceWin32 = 
isExtensionSupported(VK_KHR_EXTERNAL_FENCE_WIN32_EXTENSION_NAME); - properties.limits.externalMemoryWin32 = isExtensionSupported(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); - properties.limits.externalSemaphoreWin32 = isExtensionSupported(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); + } // we compare all limits against the defaults easily! diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index b919e0333a..4eb12e30e3 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -305,7 +305,7 @@ bool IGPUCommandBuffer::waitEvents(const std::span events, const SEvent bool IGPUCommandBuffer::pipelineBarrier(const core::bitflag dependencyFlags, const SPipelineBarrierDependencyInfo& depInfo) { - if (!checkStateBeforeRecording(~queue_flags_t::NONE)) + if (!checkStateBeforeRecording(/*everything is allowed*/)) return false; if (depInfo.memBarriers.empty() && depInfo.bufBarriers.empty() && depInfo.imgBarriers.empty()) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 97030ccbba..5ac47d81d7 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -647,4 +647,74 @@ bool ILogicalDevice::createGraphicsPipelines( if (!output[i]) return false; return true; +} + +core::smart_refctd_ptr ILogicalDevice::createBuffer(IGPUBuffer::SCreationParams&& creationParams) +{ + const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; + if (creationParams.size > maxSize) + { + m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit!", system::ILogger::ELL_ERROR, creationParams.size, this, maxSize); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= 
handleType; + + auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(creationParams.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + dedicatedOnly |= props.dedicatedOnly; + } + } + return createBuffer_impl(std::move(creationParams), dedicatedOnly); +} + +core::smart_refctd_ptr ILogicalDevice::createImage(IGPUImage::SCreationParams&& params) +{ + if (!IGPUImage::validateCreationParameters(params)) + { + m_logger.log("Failed to create Image, invalid creation parameters!", system::ILogger::ELL_ERROR); + return nullptr; + } + + const bool external = params.externalHandleTypes.value; + bool dedicatedOnly = false; + if (external) + { + core::bitflag requestedTypes = params.externalHandleTypes; + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) + 1) + { + const auto handleType = static_cast(1u << (idx - 1)); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalImageProperties(params.format, params.tiling, params.type, params.usage, params.flags, handleType); + + if (props.maxArrayLayers < params.arrayLayers || + !core::bitflag(props.sampleCounts).hasFlags(params.samples) || + /* props.maxResourceSize?? */ + props.maxExtent.width < params.extent.width || + props.maxExtent.height < params.extent.height || + props.maxExtent.depth < params.extent.depth) + { + return nullptr; + } + + if (!core::bitflag(static_cast(props.compatibleTypes)).hasFlags(params.externalHandleTypes)) // incompatibility between requested types + return nullptr; + + dedicatedOnly |= props.dedicatedOnly; + } + } + // TODO: @Cyprian validation of creationParams against the device's limits (sample counts, etc.) 
see vkCreateImage + return createImage_impl(std::move(params), dedicatedOnly); } \ No newline at end of file From cbd18f482ce0a32c513f4e5ee95281b7a157b0a2 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 23:28:02 +0300 Subject: [PATCH 58/62] add missing cuda fn & map queue indices to vk --- include/nbl/video/CCUDAHandler.h | 1 + src/nbl/video/CVulkanCommandBuffer.cpp | 28 ++++++++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index dbad47877d..44b6766e40 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -136,6 +136,7 @@ class CCUDAHandler : public core::IReferenceCounted ,cuDestroyExternalSemaphore ,cuImportExternalSemaphore ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index 2b1f9d9070..64ec5f68c0 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -48,25 +48,41 @@ void fill(vk_barrier_t& out, const ResourceBarrier& in, uint32_t selfQueueFamily // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkBufferMemoryBarrier2-buffer-04088 if (concurrentSharing) selfQueueFamilyIndex = IQueue::FamilyIgnored; + + auto mapQFIdx = [](uint32_t idx) + { + switch (idx) + { + case IQueue::FamilyExternal: + case IQueue::FamilyIgnored: + case IQueue::FamilyForeign: + idx |= 1u << 31; + break; + } + return idx; + }; + if constexpr (!std::is_same_v) { - out.srcQueueFamilyIndex = selfQueueFamilyIndex; - out.dstQueueFamilyIndex = selfQueueFamilyIndex; + out.srcQueueFamilyIndex = mapQFIdx(selfQueueFamilyIndex); + out.dstQueueFamilyIndex = mapQFIdx(selfQueueFamilyIndex); } const asset::SMemoryBarrier* memoryBarrier; if constexpr (std::is_same_v) { memoryBarrier = &in.dep; // 
in.otherQueueFamilyIndex==selfQueueFamilyIndex not resulting in ownership transfer is implicit - if (!concurrentSharing && in.otherQueueFamilyIndex!=IQueue::FamilyIgnored) - switch (in.ownershipOp) + if (!concurrentSharing && in.otherQueueFamilyIndex != IQueue::FamilyIgnored) { + switch (in.ownershipOp) + { case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE: - out.dstQueueFamilyIndex = in.otherQueueFamilyIndex; + out.dstQueueFamilyIndex = mapQFIdx(in.otherQueueFamilyIndex); break; case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE: - out.srcQueueFamilyIndex = in.otherQueueFamilyIndex; + out.srcQueueFamilyIndex = mapQFIdx(in.otherQueueFamilyIndex); break; + } } } else From 23fe8d4518452028c36f2343886b51c9d08ff6c1 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 23:28:13 +0300 Subject: [PATCH 59/62] update submodule --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 8cd78a71f4..744dd44c3b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8cd78a71f4a03c7ace9df2ee9b9df07317779909 +Subproject commit 744dd44c3bd6d5bb5734402b85f49fd0e27a46cc From c32fd793f028f3c8974828f5df605e281bb0ce1d Mon Sep 17 00:00:00 2001 From: atkurtul Date: Fri, 19 Jan 2024 01:12:01 +0300 Subject: [PATCH 60/62] cache cuda devices --- examples_tests | 2 +- include/nbl/video/CCUDAHandler.h | 28 +++-- src/nbl/video/CCUDAHandler.cpp | 207 +++++++++++++++++-------------- src/nbl/video/ILogicalDevice.cpp | 21 ++-- 4 files changed, 147 insertions(+), 111 deletions(-) diff --git a/examples_tests b/examples_tests index 744dd44c3b..9897e115e7 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 744dd44c3bd6d5bb5734402b85f49fd0e27a46cc +Subproject commit 9897e115e726052662596ba6915c5438ebd51030 diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 44b6766e40..022024e856 100644 --- 
a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -182,6 +182,18 @@ class CCUDAHandler : public core::IReferenceCounted return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } + struct SCUDADeviceInfo + { + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + }; + + inline core::vector const& getAvailableDevices() const + { + return m_availableDevices; + } + // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -217,6 +229,7 @@ class CCUDAHandler : public core::IReferenceCounted result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); } + inline ptx_and_nvrtcResult_t compileDirectlyToPTX( const char* source, const char* filename, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, @@ -225,6 +238,7 @@ class CCUDAHandler : public core::IReferenceCounted { return compileDirectlyToPTX(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); } + inline ptx_and_nvrtcResult_t compileDirectlyToPTX( system::IFile* file, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, @@ -245,16 +259,8 @@ class CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) - { - for (auto& header : m_headers) - { - 
m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); - m_headerNamesStorage.push_back(header->getFileName().string()); - m_headerNames.push_back(m_headerNamesStorage.back().c_str()); - } - } + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + ~CCUDAHandler() = default; @@ -284,6 +290,8 @@ class CCUDAHandler : public core::IReferenceCounted core::vector m_headerNames; system::logger_opt_smart_ptr m_logger; int m_version; + + core::vector m_availableDevices; }; } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 09c2fbe14e..2789bed2a6 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -11,7 +11,49 @@ namespace nbl::video { - + +CCUDAHandler::CCUDAHandler( + CUDA&& _cuda, + NVRTC&& _nvrtc, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_cuda(std::move(_cuda)) + , m_nvrtc(std::move(_nvrtc)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + , m_version(_version) +{ + for (auto& header : m_headers) + { + m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); + m_headerNamesStorage.push_back(header->getFileName().string()); + m_headerNames.push_back(m_headerNamesStorage.back().c_str()); + } + + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + return; + + for (int ordinal = 0; ordinal < deviceCount; ordinal++) + { + CUdevice handle = -1; + if (m_cuda.pcuDeviceGet(&handle, ordinal) != CUDA_SUCCESS || handle < 0) + continue; + + CUuuid uuid = {}; + if (m_cuda.pcuDeviceGetUuid(&uuid, handle) != CUDA_SUCCESS) + continue; + + m_availableDevices.emplace_back(handle, uuid); + + int* attributes = m_availableDevices.back().attributes; + for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) + m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + + } +} + bool 
CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) @@ -527,110 +569,95 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) - return nullptr; - - for (int ordinal=0; ordinalgetProperties().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.uuid, &physicalDevice->getProperties().deviceUUID, VK_UUID_SIZE)) { - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; - for (int i=0; i(i),handle); - CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; - const int& archMajor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; - const int& archMinor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; + const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; + const int& archMinor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; switch (archMajor) { - case 3: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_30; - break; - case 2: - arch = CCUDADevice::EVA_32; - break; - case 5: - arch = CCUDADevice::EVA_35; - break; - case 7: - arch = CCUDADevice::EVA_37; - break; - default: - break; - } + case 3: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_30; break; - case 5: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_50; - break; - case 2: - arch = CCUDADevice::EVA_52; - break; - case 3: - arch = CCUDADevice::EVA_53; - break; - default: - break; - } + case 2: + arch = CCUDADevice::EVA_32; break; - case 6: - switch (archMinor) - { - case 0: - arch = CCUDADevice::EVA_60; - break; - case 1: - arch = CCUDADevice::EVA_61; - break; - case 2: - arch = CCUDADevice::EVA_62; - break; - default: - break; - } + case 5: + arch = CCUDADevice::EVA_35; break; case 7: - switch (archMinor) - { - case 0: - arch = 
CCUDADevice::EVA_70; - break; - case 2: - arch = CCUDADevice::EVA_72; - break; - case 5: - arch = CCUDADevice::EVA_75; - break; - default: - break; - } + arch = CCUDADevice::EVA_37; + break; + default: + break; + } + break; + case 5: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_50; + break; + case 2: + arch = CCUDADevice::EVA_52; + break; + case 3: + arch = CCUDADevice::EVA_53; break; default: - if (archMajor>=8) - arch = CCUDADevice::EVA_80; break; + } + break; + case 6: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_60; + break; + case 1: + arch = CCUDADevice::EVA_61; + break; + case 2: + arch = CCUDADevice::EVA_62; + break; + default: + break; + } + break; + case 7: + switch (archMinor) + { + case 0: + arch = CCUDADevice::EVA_70; + break; + case 2: + arch = CCUDADevice::EVA_72; + break; + case 5: + arch = CCUDADevice::EVA_75; + break; + default: + break; + } + break; + default: + if (archMajor >= 8) + arch = CCUDADevice::EVA_80; + break; } - if (arch==CCUDADevice::EVA_COUNT) + if (arch == CCUDADevice::EVA_COUNT) continue; - auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,handle,core::smart_refctd_ptr(this)); - return core::smart_refctd_ptr(device,core::dont_grab); - } - } + return core::smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)), core::dont_grab); + } + } + return nullptr; } diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 5ac47d81d7..69460619fe 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -111,7 +111,7 @@ bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag return getSupportedStageMask(queueFamilyIndex).hasFlags(stageMask); } -bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag stageMask) const +bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag 
accesMask) const { if (queueFamilyIndex>m_queueFamilyInfos->size()) return false; @@ -119,15 +119,16 @@ bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag const auto& familyProps = m_physicalDevice->getQueueFamilyProperties()[queueFamilyIndex].queueFlags; const bool shaderCapableFamily = bool(familyProps&(q_family_flags_t::COMPUTE_BIT|q_family_flags_t::GRAPHICS_BIT)); // strip special values - if (stageMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_READ_BITS)) - stageMask ^= asset::ACCESS_FLAGS::MEMORY_READ_BITS; - else if (stageMask.hasFlags(asset::ACCESS_FLAGS::SHADER_READ_BITS) && shaderCapableFamily) - stageMask ^= asset::ACCESS_FLAGS::SHADER_READ_BITS; - if (stageMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_WRITE_BITS)) - stageMask ^= asset::ACCESS_FLAGS::MEMORY_WRITE_BITS; - else if (stageMask.hasFlags(asset::ACCESS_FLAGS::SHADER_WRITE_BITS) && shaderCapableFamily) - stageMask ^= asset::ACCESS_FLAGS::SHADER_WRITE_BITS; - return getSupportedAccessMask(queueFamilyIndex).hasFlags(stageMask); + VK_ACCESS_SHADER_WRITE_BIT; + if (accesMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_READ_BITS)) + accesMask ^= asset::ACCESS_FLAGS::MEMORY_READ_BITS; + else if (accesMask.hasFlags(asset::ACCESS_FLAGS::SHADER_READ_BITS) && shaderCapableFamily) + accesMask ^= asset::ACCESS_FLAGS::SHADER_READ_BITS; + if (accesMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_WRITE_BITS)) + accesMask ^= asset::ACCESS_FLAGS::MEMORY_WRITE_BITS; + else if (accesMask.hasFlags(asset::ACCESS_FLAGS::SHADER_WRITE_BITS) && shaderCapableFamily) + accesMask ^= asset::ACCESS_FLAGS::SHADER_WRITE_BITS; + return getSupportedAccessMask(queueFamilyIndex).hasFlags(accesMask); } bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyIndex, asset::SMemoryBarrier barrier) const From 4e2185c1e3197aada9095c0ca32c46bcc01a1dda Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 20 Jan 2024 01:25:13 +0300 Subject: [PATCH 61/62] ifdef platform code --- examples_tests | 2 +- 
include/nbl/video/EApiType.h | 9 ++ include/nbl/video/IDeviceMemoryAllocation.h | 5 +- include/nbl/video/IDeviceMemoryBacked.h | 2 - include/nbl/video/IPhysicalDevice.h | 1 - include/nbl/video/ISemaphore.h | 2 +- src/nbl/video/CCUDASharedMemory.cpp | 8 -- src/nbl/video/CVulkanImage.cpp | 2 + src/nbl/video/CVulkanLogicalDevice.cpp | 152 +++++++++++++++++--- src/nbl/video/CVulkanMemoryAllocation.cpp | 6 + src/nbl/video/ILogicalDevice.cpp | 1 - 11 files changed, 151 insertions(+), 39 deletions(-) diff --git a/examples_tests b/examples_tests index 9897e115e7..73f147941e 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 9897e115e726052662596ba6915c5438ebd51030 +Subproject commit 73f147941ef5362d0adee47ae72b4088b8c49aa5 diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index e670dc90d8..275e3f0a7a 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -13,6 +13,15 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; + +using ExternalHandleType = +#ifdef _WIN32 +void* +#else +int +#endif +; + } #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 9ca663b9ea..d162a029be 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -172,7 +172,7 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE //! 
Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE - void* externalHandle = nullptr; + ExternalHandleType externalHandle = 0; }; struct SCreationParams: SInfo @@ -180,6 +180,8 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; const bool dedicated = false; }; + + inline const SCreationParams& getCreationParams() const { return m_params; } protected: inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) @@ -199,7 +201,6 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted virtual void* map_impl(const MemoryRange& range, const core::bitflag accessHint) = 0; virtual bool unmap_impl() = 0; - const ILogicalDevice* m_originDevice = nullptr; SCreationParams m_params = {}; uint8_t* m_mappedPtr = nullptr; diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index 0071a53d71..c5c28ad717 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -123,11 +123,9 @@ class IDeviceMemoryBacked : public IBackendObject m_cachedCreationParams.preDestroyCleanup = nullptr; } - //! 
members SCachedCreationParams m_cachedCreationParams; const SDeviceMemoryRequirements m_cachedMemoryReqs; - void* m_cachedExternalHandle = nullptr; }; } // end namespace nbl::video diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index e32a65d9f3..870a435f5e 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -28,7 +28,6 @@ namespace nbl::video - class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable { template static constexpr bool is_bitflag = false; diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 5434591fb6..07506067af 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -68,7 +68,7 @@ class ISemaphore : public IBackendObject core::bitflag externalHandleTypes = EHT_NONE; //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE //! Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE - void* externalHandle = nullptr; + ExternalHandleType externalHandle = nullptr; }; auto const& getCreationParams() const diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 6510967271..a5b8011920 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -10,14 +10,6 @@ namespace nbl::video core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { - IDeviceMemoryAllocator::SAllocateInfo info = { - { - .allocationSize = m_params.granularSize, - .externalHandleType = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - .externalHandle = m_params.osHandle, - } - }; - auto pd = device->getPhysicalDevice(); uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); diff --git a/src/nbl/video/CVulkanImage.cpp 
b/src/nbl/video/CVulkanImage.cpp index ff5e2dfcb9..72e9dc62fc 100644 --- a/src/nbl/video/CVulkanImage.cpp +++ b/src/nbl/video/CVulkanImage.cpp @@ -9,6 +9,8 @@ namespace nbl::video CVulkanImage::~CVulkanImage() { preDestroyStep(); + // e.g. don't destroy imported handles from the same VkInstance (e.g. if hooking into external Vulkan codebase) + // truly EXTERNAL_MEMORY imported handles, do need to be destroyed + CloseHandled (separate thing) if (!m_cachedCreationParams.skipHandleDestroy) { const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index afedf60786..607aa69caa 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -47,12 +47,38 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(uint64_t initialValue, ISemaphore::SCreationParams&& params) { - VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; - VkExportSemaphoreWin32HandleInfoKHR handleInfo = { .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, .dwAccess = GENERIC_ALL }; - VkExportSemaphoreCreateInfo exportInfo = { VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, &handleInfo, static_cast(params.externalHandleTypes.value) }; +#ifdef _WIN32 + VkImportSemaphoreWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(params.externalHandleTypes.value), + .handle = params.externalHandle, + }; + VkExportSemaphoreWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL + }; +#else + VkImportSemaphoreFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR, + .handleType = static_cast(params.externalHandleTypes.value), + .fd = params.externalHandle, + }; +#endif 
+ + VkExportSemaphoreCreateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif + .handleTypes = static_cast(params.externalHandleTypes.value) + }; + + + const bool importing = params.externalHandleTypes.value && params.externalHandle; + const bool exporting = params.externalHandleTypes.value && !params.externalHandle; VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; - type.pNext = params.externalHandleTypes.value ? &exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR + type.pNext = exporting ? &exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; type.initialValue = initialValue; @@ -63,18 +89,27 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(uint64_ if (VK_SUCCESS != m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore)) return nullptr; - if (params.externalHandleTypes.value) + VkSemaphoreGetWin32HandleInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = semaphore, + .handleType = static_cast(params.externalHandleTypes.value), + }; + +#ifdef _WIN32 + auto importfn = m_devf.vk.vkImportSemaphoreWin32HandleKHR; + auto exportfn = m_devf.vk.vkGetSemaphoreWin32HandleKHR; +#else + auto importfn = m_devf.vk.vkImportSemaphoreFdKHR; + auto exportfn = m_devf.vk.vkGetSemaphoreFdKHR; +#endif + + if ( + (importing && (VK_SUCCESS != importfn(m_vkdev, &importInfo))) || + (exporting && (VK_SUCCESS != exportfn(m_vkdev, &props, ¶ms.externalHandle))) + ) { - VkSemaphoreGetWin32HandleInfoKHR props = { - .sType 
= VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, - .semaphore = semaphore, - .handleType = static_cast(params.externalHandleTypes.value), - }; - if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, ¶ms.externalHandle)) - { - m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); - return nullptr; - } + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); + return nullptr; } return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), semaphore, std::move(params)); @@ -143,12 +178,28 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDeferredO return core::smart_refctd_ptr(reinterpret_cast(memory),core::dont_grab); } +void* DupeHandle(uint64_t pid, void* handle) +{ +#ifdef _WIN32 + DWORD flags; + HANDLE re = 0; + + HANDLE cur = GetCurrentProcess(); + HANDLE src = pid ? OpenProcess(GENERIC_ALL, false, pid) : cur; + + if (!DuplicateHandle(src, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return 0; + + CloseHandle(src); + return re; +#endif + return handle; +} IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAllocateInfo& info) { - IDeviceMemoryAllocator::SAllocation ret = {}; if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) - return ret; + return {}; VkMemoryAllocateFlagsInfo vk_allocateFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, nullptr }; { @@ -161,6 +212,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca vk_allocateInfo.allocationSize = info.allocationSize; vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; +#ifdef _WIN32 VkImportMemoryWin32HandleInfoKHR importInfo = { .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, .handleType = static_cast(info.externalHandleType), @@ -171,10 +223,19 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR, .dwAccess = GENERIC_ALL, }; +#else + 
VkImportMemoryFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .fd = (int)info.externalHandle, + }; +#endif VkExportMemoryAllocateInfo exportInfo = { .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, - .pNext = &exportInfo, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif .handleTypes = static_cast(info.externalHandleType), }; @@ -183,7 +244,11 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca if (info.externalHandleType) { if (info.externalHandle) //importing + { + auto duped = DupeHandle(0, info.externalHandle); + const_cast(info.externalHandle) = duped; *pNext = &importInfo; + } else // exporting *pNext = &exportInfo; pNext = (const void**)&((VkBaseInStructure*)*pNext)->pNext; @@ -206,7 +271,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca break; default: assert(false); - return ret; + return {}; break; } } @@ -214,15 +279,57 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca VkDeviceMemory vk_deviceMemory; auto vk_res = m_devf.vk.vkAllocateMemory(m_vkdev, &vk_allocateInfo, nullptr, &vk_deviceMemory); if (vk_res!=VK_SUCCESS) - return ret; + return {}; + + const bool exported = info.externalHandleType && !info.externalHandle; + + if (exported) + { +#ifdef _WIN32 + VkMemoryGetWin32HandleInfoKHR +#else + VkMemoryGetFdInfoKHR +#endif + handleInfo = { .sType = +#ifdef _WIN32 + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#else + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#endif + .memory = vk_deviceMemory, + .handleType = static_cast(info.externalHandleType), + }; + + /* + For handle types defined as NT handles, + the handles returned by vkGetMemoryWin32HandleKHR are owned by the application + and hold a reference to their payload. 
To avoid leaking resources, + the application must release ownership of them + using the CloseHandle system call when they are no longer needed. + */ + + if (VK_SUCCESS != m_devf.vk. +#ifdef _WIN32 + vkGetMemoryWin32HandleKHR +#else + vkGetMemoryFdKHR +#endif + (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) + { + m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); + return {}; + } + + } // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; - - ret.memory = core::make_smart_refctd_ptr(this,vk_deviceMemory, std::move(params)); + IDeviceMemoryAllocator::SAllocation ret = {}; + ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator + if(info.dedication) { bool dedicationSuccess = false; @@ -349,7 +456,6 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB .handleTypes = creationParams.externalHandleTypes.value, }; - vk_createInfo.pNext = creationParams.externalHandleTypes.value ? 
&externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index fb214c897e..7597e33717 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -16,6 +16,12 @@ CVulkanMemoryAllocation::CVulkanMemoryAllocation( CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { + if (m_params.externalHandle) + { + bool re = CloseHandle(getCreationParams().externalHandle); + assert(re); + } + m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); } diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 69460619fe..2902ff7509 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -119,7 +119,6 @@ bool ILogicalDevice::supportsMask(const uint32_t queueFamilyIndex, core::bitflag const auto& familyProps = m_physicalDevice->getQueueFamilyProperties()[queueFamilyIndex].queueFlags; const bool shaderCapableFamily = bool(familyProps&(q_family_flags_t::COMPUTE_BIT|q_family_flags_t::GRAPHICS_BIT)); // strip special values - VK_ACCESS_SHADER_WRITE_BIT; if (accesMask.hasFlags(asset::ACCESS_FLAGS::MEMORY_READ_BITS)) accesMask ^= asset::ACCESS_FLAGS::MEMORY_READ_BITS; else if (accesMask.hasFlags(asset::ACCESS_FLAGS::SHADER_READ_BITS) && shaderCapableFamily) From bd0b76a341d86034474cbf67c266a9b25e80e81d Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 20 Jan 2024 01:31:58 +0300 Subject: [PATCH 62/62] log queue validation warning --- include/nbl/video/IQueue.h | 19 +++++++++++++------ src/nbl/video/IQueue.cpp | 9 ++++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index 654d95a847..4000fcd2a3 100644 --- a/include/nbl/video/IQueue.h +++ 
b/include/nbl/video/IQueue.h @@ -68,7 +68,7 @@ class IQueue : public core::Interface, public core::Unmovable { SUCCESS, DEVICE_LOST, - OTHER_ERROR + OTHER_ERROR, }; // struct SSubmitInfo @@ -92,16 +92,23 @@ class IQueue : public core::Interface, public core::Unmovable std::span commandBuffers = {}; std::span signalSemaphores = {}; - inline bool valid() const + enum Validity + { + INVALID, + VALID, + WORK_WITHOUT_SYNC, + }; + + inline Validity valid() const { // any two being empty is wrong if (commandBuffers.empty() && signalSemaphores.empty()) // wait and do nothing - return false; + return INVALID; if (waitSemaphores.empty() && signalSemaphores.empty()) // work without sync - return false; + return WORK_WITHOUT_SYNC; if (waitSemaphores.empty() && commandBuffers.empty()) // signal without doing work first - return false; - return true; + return INVALID; + return VALID; } }; virtual RESULT submit(const std::span _submits); diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index e75e7b2cad..2527562bac 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -13,8 +13,15 @@ auto IQueue::submit(const std::span _submits) -> RESULT auto* logger = m_originDevice->getPhysicalDevice()->getDebugCallback()->getLogger(); for (const auto& submit : _submits) { - if (!submit.valid()) + switch (submit.valid()) + { + case SSubmitInfo::INVALID: return RESULT::OTHER_ERROR; + case SSubmitInfo::WORK_WITHOUT_SYNC: + logger->log("Work withouth sync!", system::ILogger::ELL_WARNING); + default: + break; + } auto invalidSemaphores = [this,logger](const std::span semaphoreInfos) -> bool {