diff --git a/CMakeLists.txt b/CMakeLists.txt index c64700ce482ba..5545231c5bccf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -442,7 +442,7 @@ option(IREE_BYTECODE_MODULE_ENABLE_TSAN "Enable thread sanitizer in IREE modules option(IREE_ENABLE_UBSAN "Enable undefined behavior sanitizer" OFF) option(IREE_ENABLE_SPLIT_DWARF "Enable gsplit-dwarf for debug information if the platform supports it" OFF) option(IREE_ENABLE_THIN_ARCHIVES "Enables thin ar archives (elf systems only). Disable for released static archives" OFF) -option(IREE_LINK_COMPILER_SHARED_LIBRARY "Links IREE tools using the compiler compiled into a shared library" ON) +option(IREE_LINK_COMPILER_SHARED_LIBRARY "Links IREE tools using the compiler compiled into a shared library" OFF) # STREQUAL feels wrong here - we don't care about the exact true-value used, # ON or TRUE or something else. But we haven't been able to think of a less bad diff --git a/build_tools/scripts/check_vulkan.sh b/build_tools/scripts/check_vulkan.sh index b4d3bc52220de..9624f10d06d42 100755 --- a/build_tools/scripts/check_vulkan.sh +++ b/build_tools/scripts/check_vulkan.sh @@ -24,3 +24,6 @@ fi echo "${VULKAN_INSTANCE?}" echo "${VK_PHYSICAL_DEVICE_PROPERTIES?}" + +cat /tmp/vulkaninfo.stdout +cat /tmp/vulkaninfo.stderr diff --git a/runtime/src/iree/hal/drivers/vulkan/base_buffer.c b/runtime/src/iree/hal/drivers/vulkan/base_buffer.c index 7ed6986968d5d..eefed8aa22168 100644 --- a/runtime/src/iree/hal/drivers/vulkan/base_buffer.c +++ b/runtime/src/iree/hal/drivers/vulkan/base_buffer.c @@ -45,12 +45,15 @@ iree_status_t iree_hal_vulkan_find_memory_type( iree_hal_memory_type_t requested_type = params->type; if (device_props->deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) { + fprintf(stderr, "iree_hal_vulkan_find_memory_type integrated gpu\n"); // Integrated GPUs have tiny device local heaps commonly used for // framebuffers and other bounded resources. We don't currently try to use // them but could for very small transients. if (iree_all_bits_set(requested_type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { requested_type &= ~IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL; requested_type |= IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE; + fprintf(stderr, + "iree_hal_vulkan_find_memory_type flip to device visible\n"); } } @@ -61,30 +64,40 @@ iree_status_t iree_hal_vulkan_find_memory_type( // Device-local, host-visible. require_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; prefer_flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + fprintf(stderr, + "iree_hal_vulkan_find_memory_type req host visible, pref device " + "local\n"); } else { // Device-local only. require_flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + fprintf(stderr, "iree_hal_vulkan_find_memory_type device local only\n"); } } else { if (iree_all_bits_set(requested_type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { // Host-local, device-visible. require_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + fprintf(stderr, + "iree_hal_vulkan_find_memory_type host local device visible\n"); } else { // Host-local only. require_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + fprintf(stderr, "iree_hal_vulkan_find_memory_type host visible only\n"); } } if (iree_all_bits_set(requested_type, IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { require_flags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + fprintf(stderr, "iree_hal_vulkan_find_memory_type host cached\n"); } if (iree_all_bits_set(requested_type, IREE_HAL_MEMORY_TYPE_HOST_COHERENT)) { require_flags |= VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + fprintf(stderr, "iree_hal_vulkan_find_memory_type host coherent\n"); } if (iree_any_bit_set(requested_type, IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED | IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT)) { require_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + fprintf(stderr, "iree_hal_vulkan_find_memory_type mapping req\n"); } int most_bits_count = 0; @@ -95,6 +108,7 @@ iree_status_t iree_hal_vulkan_find_memory_type( !iree_hal_vulkan_is_memory_type_usable(flags) || !iree_all_bits_set(allowed_type_indices, 1u << i)) { // Excluded (required bits missing or memory type is not usable). + fprintf(stderr, "iree_hal_vulkan_find_memory_type excluding bit %u\n", i); continue; } // When all required bits are satisfied try to find the memory type that @@ -103,9 +117,13 @@ iree_status_t iree_hal_vulkan_find_memory_type( if (most_bits_idx == -1) { most_bits_count = bit_count; most_bits_idx = (int)i; + fprintf(stderr, + "iree_hal_vulkan_find_memory_type first bit found at %u\n", i); } else if (bit_count > most_bits_count) { most_bits_count = bit_count; most_bits_idx = (int)i; + fprintf(stderr, "iree_hal_vulkan_find_memory_type better found at %u\n", + i); } } if (most_bits_idx == -1) { diff --git a/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc b/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc index b6352448e8690..a5b81bf379db1 100644 --- a/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc +++ b/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc @@ -268,6 +268,8 @@ static iree_status_t iree_hal_vulkan_native_allocator_commit_and_wrap( VkMemoryRequirements requirements = {0}; logical_device->syms()->vkGetBufferMemoryRequirements(*logical_device, handle, &requirements); + fprintf(stderr, "vkGetBufferMemoryRequirements alignment %u bits %08X\n", + (uint32_t)requirements.alignment, requirements.memoryTypeBits); uint32_t memory_type_index = 0; IREE_RETURN_IF_ERROR(iree_hal_vulkan_find_memory_type( &allocator->device_props, &allocator->memory_props, params, @@ -287,13 +289,25 @@ static iree_status_t iree_hal_vulkan_native_allocator_commit_and_wrap( allocator->device_props_11.maxMemoryAllocationSize, out_buffer); } + fprintf(stderr, "vkAllocateMemory\nreq size %u\nmemory type index %u\n", + (uint32_t)requirements.size, memory_type_index); + + iree_device_size_t aligned_size = + iree_device_align(allocation_size, 16 * 1024); + // Allocate the device memory we'll attach the buffer to. VkMemoryAllocateInfo allocate_info = {}; allocate_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; allocate_info.pNext = NULL; - allocate_info.allocationSize = requirements.size; + allocate_info.allocationSize = iree_max(aligned_size, requirements.size); allocate_info.memoryTypeIndex = memory_type_index; VkDeviceMemory device_memory = VK_NULL_HANDLE; + fprintf(stderr, + "vkAllocateMemory(%p, allocationSize=%" PRIu64 + ", memoryTypeIndex=%u, pNext=%p, %p, %p)\n", + (void*)logical_device->value(), + (uint64_t)allocate_info.allocationSize, allocate_info.memoryTypeIndex, + allocate_info.pNext, logical_device->allocator(), &device_memory); VK_RETURN_IF_ERROR(logical_device->syms()->vkAllocateMemory( *logical_device, &allocate_info, logical_device->allocator(), &device_memory), @@ -307,7 +321,7 @@ static iree_status_t iree_hal_vulkan_native_allocator_commit_and_wrap( internal_release_callback.user_data = NULL; iree_status_t status = iree_hal_vulkan_native_buffer_wrap( (iree_hal_allocator_t*)allocator, params->type, params->access, - params->usage, allocation_size, + params->usage, aligned_size, /*byte_offset=*/0, /*byte_length=*/allocation_size, logical_device, device_memory, handle, internal_release_callback, iree_hal_buffer_release_callback_null(), diff --git a/runtime/src/iree/hal/drivers/vulkan/vma_allocator.cc b/runtime/src/iree/hal/drivers/vulkan/vma_allocator.cc index c1889c03b41fe..af6f6d2efb27b 100644 --- a/runtime/src/iree/hal/drivers/vulkan/vma_allocator.cc +++ b/runtime/src/iree/hal/drivers/vulkan/vma_allocator.cc @@ -261,6 +261,19 @@ static void VKAPI_PTR iree_hal_vulkan_vma_free_callback( static void iree_hal_vulkan_vma_allocator_destroy( iree_hal_allocator_t* IREE_RESTRICT base_allocator); +static PFN_vkAllocateMemory allocate_memory_ptr = NULL; +static VkResult HOOKED_vkAllocateMemory( + VkDevice device, const VkMemoryAllocateInfo* pAllocateInfo, + const VkAllocationCallbacks* pAllocator, VkDeviceMemory* pMemory) { + fprintf(stderr, + "vkAllocateMemory(%p, allocationSize=%" PRIu64 + ", memoryTypeIndex=%u, pNext=%p, %p, %p)\n", + (void*)device, (uint64_t)pAllocateInfo->allocationSize, + pAllocateInfo->memoryTypeIndex, pAllocateInfo->pNext, pAllocator, + pMemory); + return allocate_memory_ptr(device, pAllocateInfo, pAllocator, pMemory); +} + iree_status_t iree_hal_vulkan_vma_allocator_create( const iree_hal_vulkan_device_options_t* options, VkInstance instance, VkPhysicalDevice physical_device, VkDeviceHandle* logical_device, @@ -287,7 +300,8 @@ iree_status_t iree_hal_vulkan_vma_allocator_create( syms->vkGetPhysicalDeviceProperties; vulkan_fns.vkGetPhysicalDeviceMemoryProperties = syms->vkGetPhysicalDeviceMemoryProperties; - vulkan_fns.vkAllocateMemory = syms->vkAllocateMemory; + allocate_memory_ptr = syms->vkAllocateMemory; + vulkan_fns.vkAllocateMemory = HOOKED_vkAllocateMemory; vulkan_fns.vkFreeMemory = syms->vkFreeMemory; vulkan_fns.vkMapMemory = syms->vkMapMemory; vulkan_fns.vkUnmapMemory = syms->vkUnmapMemory; diff --git a/runtime/src/iree/hal/drivers/vulkan/vma_impl.h b/runtime/src/iree/hal/drivers/vulkan/vma_impl.h index 7204959f85e77..60ab45dc4c080 100644 --- a/runtime/src/iree/hal/drivers/vulkan/vma_impl.h +++ b/runtime/src/iree/hal/drivers/vulkan/vma_impl.h @@ -18,6 +18,12 @@ // to be omitted and not have VMA poking around where it shouldn't. #define VMA_DYNAMIC_VULKAN_FUNCTIONS 0 +#define VMA_DEBUG_LOG_FORMAT(format, ...) \ + do { \ + fprintf(stderr, (format), __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } while (false) + #include // IWYU pragma: export #endif // IREE_HAL_DRIVERS_VULKAN_VMA_IMPL_H_ diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc index 7dca24bae2513..6b4b72eaf2628 100644 --- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc +++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc @@ -527,7 +527,7 @@ static iree_hal_vulkan_device_t* iree_hal_vulkan_device_cast( IREE_API_EXPORT void iree_hal_vulkan_device_options_initialize( iree_hal_vulkan_device_options_t* out_options) { memset(out_options, 0, sizeof(*out_options)); - out_options->flags = IREE_HAL_VULKAN_DEVICE_FLAG_VMA_ALLOCATOR; + out_options->flags = 0; // IREE_HAL_VULKAN_DEVICE_FLAG_VMA_ALLOCATOR; out_options->large_heap_block_size = 64 * 1024 * 1024; } diff --git a/tests/e2e/models/BUILD.bazel b/tests/e2e/models/BUILD.bazel index f80dd3371f09c..171609cad3ec5 100644 --- a/tests/e2e/models/BUILD.bazel +++ b/tests/e2e/models/BUILD.bazel @@ -23,7 +23,6 @@ iree_lit_test_suite( [ "collatz.mlir", "edge_detection.mlir", - "fragment_000.mlir", "fullyconnected.mlir", "mnist_fake_weights.mlir", "unidirectional_lstm.mlir", diff --git a/tests/e2e/models/CMakeLists.txt b/tests/e2e/models/CMakeLists.txt index 3390136c7839f..486d69d295afb 100644 --- a/tests/e2e/models/CMakeLists.txt +++ b/tests/e2e/models/CMakeLists.txt @@ -16,7 +16,6 @@ iree_lit_test_suite( SRCS "collatz.mlir" "edge_detection.mlir" - "fragment_000.mlir" "fullyconnected.mlir" "mnist_fake_weights.mlir" "unidirectional_lstm.mlir" diff --git a/tests/e2e/models/fragment_000.mlir b/tests/e2e/models/fragment_000.mlir deleted file mode 100644 index aacdd4b277fde..0000000000000 --- a/tests/e2e/models/fragment_000.mlir +++ /dev/null @@ -1,39 +0,0 @@ -// RUN: iree-run-mlir --Xcompiler,iree-input-type=stablehlo --Xcompiler,iree-hal-target-backends=vmvx %s | FileCheck %s -// RUN: iree-run-mlir --Xcompiler,iree-input-type=stablehlo --Xcompiler,iree-hal-target-backends=llvm-cpu %s | FileCheck %s -// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-input-type=stablehlo --Xcompiler,iree-hal-target-backends=vulkan-spirv %s | FileCheck %s) -// RUN: [[ $IREE_METAL_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-input-type=stablehlo --Xcompiler,iree-hal-target-backends=metal-spirv %s | FileCheck %s) - -// CHECK-LABEL: EXEC @entry -func.func @entry() -> tensor<5x5xf32> { - %0 = util.unfoldable_constant dense<0.000000e+00> : tensor - %1 = util.unfoldable_constant dense<[[1.000000e+00], [-2.000000e+00], [-3.000000e+00], [4.000000e+00], [-5.000000e+00]]> : tensor<5x1xf32> - %2 = util.unfoldable_constant dense<1.000000e+00> : tensor - %3 = util.unfoldable_constant dense<[[3.464990e+00, -7.643890e+00, -5.722490e+00, 5.98052978, 1.768920e+01], [2.970700e+00, -6.207340e+00, -4.259620e+00, 4.760550e+00, 1.387840e+01], [2.476410e+00, -4.770790e+00, -2.796750e+00, 3.540560e+00, 1.006750e+01], [1.982120e+00, -3.334240e+00, -1.333880e+00, 2.320580e+00, 6.256660e+00], [1.487830e+00, -1.897700e+00, 1.289900e-01, 1.100600e+00, 2.445800e+00]]> : tensor<5x5xf32> - %4 = util.unfoldable_constant dense<0.000000e+00> : tensor<5xf32> - %5 = call @_entry(%0, %1, %2, %3, %4) : (tensor, tensor<5x1xf32>, tensor, tensor<5x5xf32>, tensor<5xf32>) -> tensor<5x5xf32> - return %5 : tensor<5x5xf32> -} -func.func private @_entry(%arg0: tensor, %arg1: tensor<5x1xf32>, %arg2: tensor, %arg3: tensor<5x5xf32>, %arg4: tensor<5xf32>) -> tensor<5x5xf32> { - %0 = stablehlo.broadcast_in_dim %arg1, dims = [0, 1] {name = "broadcast.44"} : (tensor<5x1xf32>) -> tensor<5x1x5xf32> - %1 = stablehlo.broadcast_in_dim %arg2, dims = [] {name = "broadcast.9"} : (tensor) -> tensor<5x1x5xf32> - %2 = stablehlo.multiply %0, %1 : tensor<5x1x5xf32> - %3 = stablehlo.broadcast_in_dim %arg0, dims = [] {name = "broadcast.47"} : (tensor) -> tensor<5x1x5xf32> - %4 = stablehlo.compare GT, %2, %3 : (tensor<5x1x5xf32>, tensor<5x1x5xf32>) -> tensor<5x1x5xi1> - %5 = stablehlo.broadcast_in_dim %arg0, dims = [] {name = "broadcast.11"} : (tensor) -> tensor<5x1x5xf32> - %6 = stablehlo.broadcast_in_dim %arg0, dims = [] {name = "broadcast.67"} : (tensor) -> tensor<5x5xf32> - %7 = stablehlo.broadcast_in_dim %arg4, dims = [1] {name = "broadcast.64"} : (tensor<5xf32>) -> tensor<5x5xf32> - %8 = stablehlo.add %arg3, %7 : tensor<5x5xf32> - %9 = stablehlo.maximum %6, %8 {name = "maximum.68"} : tensor<5x5xf32> - %10 = stablehlo.reshape %9 {name = "reshape.70"} : (tensor<5x5xf32>) -> tensor<5x1x5xf32> - %11 = stablehlo.select %4, %5, %10 {name = "select.71"} : tensor<5x1x5xi1>, tensor<5x1x5xf32> - %12 = stablehlo.reshape %11 {name = "reshape.72"} : (tensor<5x1x5xf32>) -> tensor<5x5xf32> - return %12 : tensor<5x5xf32> -} - -// On separate lines to avoid "[[" which FileCheck interprets as substitutions -// CHECK: 5x5xf32= -// CHECK-SAME: [0 0 0 0 0] -// CHECK-SAME: [2.97{{[0-9]+}} 0 0 4.76{{[0-9]+}} 13.87{{[0-9]+}}] -// CHECK-SAME: [2.47{{[0-9]+}} 0 0 3.54{{[0-9]+}} 10.06{{[0-9]+}}] -// CHECK-SAME: [0 0 0 0 0] -// CHECK-SAME: [1.48{{[0-9]+}} 0 0.12{{[0-9]+}} 1.10{{[0-9]+}} 2.44{{[0-9]+}}] diff --git a/tests/e2e/models/mnist_train_test/CMakeLists.txt b/tests/e2e/models/mnist_train_test/CMakeLists.txt index a2ff8eff5ee03..b75508aa16f92 100644 --- a/tests/e2e/models/mnist_train_test/CMakeLists.txt +++ b/tests/e2e/models/mnist_train_test/CMakeLists.txt @@ -40,6 +40,18 @@ if(IREE_TARGET_BACKEND_CUDA AND IREE_HAL_DRIVER_CUDA) endif() if(IREE_TARGET_BACKEND_VULKAN_SPIRV AND IREE_HAL_DRIVER_VULKAN) + iree_py_test( + NAME + mnist_train_test_vulkan_vma + SRCS + "mnist_train_test.py" + ARGS + "--target_backend=vulkan-spirv" + "--driver=vulkan" + "--vma" + LABELS + "driver=vulkan" + ) iree_py_test( NAME mnist_train_test_vulkan diff --git a/tests/e2e/models/mnist_train_test/mnist_train_test.py b/tests/e2e/models/mnist_train_test/mnist_train_test.py index 47334ac6fe523..57671731a1f6d 100644 --- a/tests/e2e/models/mnist_train_test/mnist_train_test.py +++ b/tests/e2e/models/mnist_train_test/mnist_train_test.py @@ -17,6 +17,7 @@ import numpy as np from iree.compiler.tools import InputType, compile_file +from iree import runtime as rt from iree.runtime import load_vm_flatbuffer_file MODEL_ARTIFACTS_URL = "https://storage.googleapis.com/iree-model-artifacts/mnist_train.a49ba1535a45ac0f3e6be22a7ed5dddf4a53cd1f41126af938f0667b998f8e11.tar" @@ -76,6 +77,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--target_backend", type=str, default="llvm-cpu") parser.add_argument("--driver", type=str, default="local-task") + parser.add_argument("--vma", default=False, action="store_true") return parser.parse_known_args() @@ -115,6 +117,13 @@ def extract_test_data(archive_path: str, out_dir: str): class MnistTrainTest(unittest.TestCase): def test_mnist_training(self): + if args.vma: + rt.flags.parse_flags("--vulkan_vma_allocator=true") + else: + rt.flags.parse_flags("--vulkan_vma_allocator=false") + rt.flags.parse_flags("--vulkan_validation_layers=true") + rt.flags.parse_flags("--vulkan_debug_utils=true") + rt.flags.parse_flags("--vulkan_debug_verbosity=4") with tempfile.TemporaryDirectory() as tmp_dir: archive_path = os.path.join(tmp_dir, "mnist_train.tar") download_test_data(archive_path) diff --git a/tests/e2e/tensor_ops/tensor_cast.mlir b/tests/e2e/tensor_ops/tensor_cast.mlir index 13217b9ce14a3..95a3b36480af2 100644 --- a/tests/e2e/tensor_ops/tensor_cast.mlir +++ b/tests/e2e/tensor_ops/tensor_cast.mlir @@ -1,6 +1,7 @@ // RUN: iree-run-mlir --Xcompiler,iree-hal-target-backends=llvm-cpu %s | FileCheck %s // RUN: [[ $IREE_VMVX_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-hal-target-backends=vmvx %s | FileCheck %s) -// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-hal-target-backends=vulkan-spirv %s | FileCheck %s) +// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-hal-target-backends=vulkan-spirv --trace_execution --vulkan_debug_verbosity=4 --vulkan_debug_utils=true --vulkan_validation_layers=true --vulkan_vma_allocator=true %s | FileCheck %s) +// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-hal-target-backends=vulkan-spirv --trace_execution --vulkan_debug_verbosity=4 --vulkan_debug_utils=true --vulkan_validation_layers=true %s | FileCheck %s) func.func @tensor_cast() -> tensor<2x?xf32> { %input = util.unfoldable_constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf32>