This is a squash of the commits below.

commit 4a9db43abe38ce7a840d3f8ad830a69148af243c Author: Tarun Prabhu <tarun.prabhu@gmail.com> Date: Thu Aug 1 17:19:51 2024 -0600 Fix issues introduced after merge with intersect commit f29421607d362fd431ed3fc029cf32afce15a049 Author: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Tue May 7 09:22:58 2024 -0600 moved the intersect experiment to its own directory. commit b1aad99111f3145b8f6e1581ad0776501b31a4a6 Author: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Mon May 6 16:30:46 2024 -0600 Tweak makefile to match details and remove hard-coded gpu target. commit 9c74e29e278039a9ec4724966e1861fc1c9f45c2 Author: Danny Shevitz <shevitz@lanl.gov> Date: Mon May 6 11:32:30 2024 -0600 cleaned up intersect commit 2fcd7517b0dd3aab1e1972edb60600b79c3f96da Author: Danny Shevitz <shevitz@lanl.gov> Date: Thu Apr 11 09:56:01 2024 -0600 prior to merge uncommented in intersect commit f02cc1a94be95a2521af8266616ab7398c228acd Author: Danny Shevitz <shevitz@lanl.gov> Date: Fri Mar 29 09:38:53 2024 -0600 trapping the multi-target cuda stream error commit a6e6f766db0c97336015585380eb0e634262c329 Author: Danny Shevitz <shevitz@lanl.gov> Date: Wed Mar 27 11:10:03 2024 -0600 intersect is sort of working commit efbe047fa1878bed3084af8d7afdaa05f1d57c41 Author: Danny Shevitz <shevitz@lanl.gov> Date: Wed Mar 6 09:33:09 2024 -0700 At the moment, no LTO on intersect commit f31f2d4c0b4a974a633da1818ca6703727c21592 Author: Danny Shevitz <shevitz@lanl.gov> Date: Tue Mar 5 10:41:39 2024 -0700 prior to pulling, trying to get intersect working with LTO commit 2d8f485d641ccf04b3d3120a076f0ea534c30caf Author: Danny Shevitz <shevitz@lanl.gov> Date: Wed Feb 21 13:30:36 2024 -0700 modified the kokkos makefile so it finds the patched kokkos and added support for intersect by changing the recognized has kokkos flag commit 3e498f87a96b1fe32408cc74d54d54980462b45b Author: Danny Shevitz <shevitz@lanl.gov> Date: Mon Feb 12 11:38:34 2024 -0700 working on make multi-target/intersect build commit c0f771099c52a81cd6a41a5e7701fb1e9aa0b6b5 Author: Danny Shevitz <shevitz@lanl.gov> Date: Thu Feb 1 11:08:32 2024 -0700 Revert "fixed a typo in the makefiles" This reverts commit 0404e2fc5cbcd5ed8bc20a194ed056ef6bd06521. commit 227d32de1adbd813d645466b1258bc69be5e5c93 Author: Danny Shevitz <shevitz@lanl.gov> Date: Thu Feb 1 10:58:43 2024 -0700 updated cuda.mk commit 592a8ec6c79b2ad4f519d24d00cb229dced06b34 Author: Danny Shevitz <shevitz@lanl.gov> Date: Wed Oct 18 13:39:31 2023 -0600 fixed a typo in the makefiles commit 3588c8d357a2bdff0284724f1c9eea65711cad1d Author: Danny Shevitz <shevitz@lanl.gov> Date: Thu Feb 1 09:13:32 2024 -0700 finished merge with 16.x commit 474d653e39d478ed58c66e2e73058861941be3fc Author: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Fri Apr 12 10:35:38 2024 -0600 Disabled sync region optiizations (merging) due to issues with multi-target code. While this could have performance implications in some situations it is the only way we can avoid errors with mixed threaded and GPU code. More bugs may be lurking. Also includes updates to the runtime to deal with exposing GPU streams to the calling stack frame for correctly handling continuations. commit 35d70b993a1960e7c0399821f8f4ada89f1c25aa Author: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Thu Apr 4 16:30:40 2024 -0600 First attempt at fixing stream assignment from the runtime in a manner that GPU streams can be better captured (e.g., opencilk continuations) and GPU work can be launched and sync'ed by different host threads; this addresses a bug (flawed assumption) in the runtime when it comes to multi-target support and interoperability. commit 9e44d1189672d0c03c5744c873da51dba64b6588 Author: Patrick McCormick <> Date: Thu Mar 28 16:43:36 2024 -0600 fix bad context mistake -- relevant to multi-target thread-streams debugging... this is a temporary workaround and not a correctness guarantee for behaving well when opencilk and cuda targets are intermixed (it most certainly can also have performance implications). commit b74b28f5847b1d736870d7e6e50e33389518a363 Author: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Tue Mar 26 17:10:34 2024 -0600 extra verbose mode details on thread-stream creation. commit e8d59a79e712d638ae11513bb466a1674653dfe0 Author: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Tue Mar 26 17:04:16 2024 -0600 Quick thread-stream tweak (warning message update and context-based sync fallback). A few other odds and ends of cleanup. commit 6edc88e43175194bcf44b2846b92d2e20a8212ce Author: Tarun Prabhu <tarun.prabhu@gmail.com> Date: Thu Aug 1 16:44:25 2024 -0600 Undo change introduced by cherry picking commit. commit 0685a47c9d8a28dd96489203a227dc79b955392b Author: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Mon Mar 4 14:39:32 2024 -0700 A bit more rt feedback about libdl and some testing with rpath stuff in cmake. commit 37dbed14e0c5d04c36e6fc0fdc00d6336308602c Author: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Thu Feb 29 15:54:08 2024 -0700 Small fixes build logic (for no profiling) and nvidia cuda compute versions at runtime. commit 10f30e22788d3e9de75ce8129adab22139c8dd7f Author: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Wed Feb 21 10:40:29 2024 -0700 LTO fixes (opencilk bitcode file and auto-link args for tapir opencilk targets). Removed pure-kokkos tests as part of the default target set from all the experiments. Misc. clean up w/ experiments (e.g., makefiles), added LTO test, etc.
tarunprabhu · Aug 1, 2024 · c88b8c0 · c88b8c0
1 parent 640c85d
commit c88b8c0
Show file tree

Hide file tree

Showing 43 changed files with 1,561 additions and 3,559 deletions.
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -290,9 +290,12 @@ static void renderTapirLoweringOptions(const ArgList &Args,
                                        ArgStringList &CmdArgs,
                                        const ToolChain &TC) {
   if (Args.hasArg(options::OPT_ftapir_EQ)) {
-    if (const Arg *A = Args.getLastArg(options::OPT_ftapir_EQ))
+    if (const Arg *A = Args.getLastArg(options::OPT_ftapir_EQ)) {
       CmdArgs.push_back(Args.MakeArgString(
           Twine("--plugin-opt=tapir-target=") + A->getValue()));
+      if (std::string(A->getValue()) == std::string("opencilk"))
+        TC.AddOpenCilkABIBitcode(Args, CmdArgs, /*IsLTO=*/true);
+    }
   }
 }
 

diff --git a/kitsune/experiments/copy/makefile b/kitsune/experiments/copy/makefile
@@ -4,12 +4,10 @@ targets = copy-forall.opencilk.${host_arch}
 
 ifeq ($(BUILD_CUDA_EXPERIMENTS),true)
   targets += copy-forall.cuda.${host_arch}
-  targets += copy-kokkos.cuda.kitsune.${host_arch}
 endif
 
 ifeq ($(BUILD_HIP_EXPERIMENTS),true)
   targets += copy-forall.hip.${host_arch}
-  targets += copy-kokkos.hip.kitsune.${host_arch}
   targets += copy-hip.${host_arch}
 endif
 
@@ -22,11 +20,11 @@ copy-forall.opencilk.${host_arch}: copy-forall.cpp
 	@${FILE_SIZE}
 copy-forall.cuda.${host_arch}: copy-forall.cpp 
 	@echo $@
-	${TIME_CMD} $(KIT_CXX) $(TAPIR_CUDA_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
+	@${TIME_CMD} $(KIT_CXX) $(TAPIR_CUDA_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@${FILE_SIZE}
 copy-forall.hip.${host_arch}: copy-forall.cpp 
 	@echo $@
-	${TIME_CMD} $(KIT_CXX) $(TAPIR_HIP_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
+	@${TIME_CMD} $(KIT_CXX) $(TAPIR_HIP_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@${FILE_SIZE}
 
 # kokkos-based tests (w/out views)

diff --git a/kitsune/experiments/euler3d/makefile b/kitsune/experiments/euler3d/makefile
@@ -5,51 +5,43 @@ targets = euler3d-forall.opencilk.${host_arch}
 ifeq ($(BUILD_CUDA_EXPERIMENTS),true)
   targets += euler3d-forall.cuda.${host_arch}
   targets += euler3d-kokkos.kitsune.cuda.${host_arch}
-  targets += euler3d-kokkos.nvcc.${host_arch}
 endif
 
 ifeq ($(BUILD_HIP_EXPERIMENTS),true)
   targets += euler3d-forall.hip.${host_arch}
   targets += euler3d-kokkos.kitsune.hip.${host_arch}
-  targets += euler3d-kokkos.hipcc.${host_arch}
-  targets += euler3d-kokkos.hip.${host_arch}
 endif
 
 all: ${targets}
 
 # forall-based tests
 euler3d-forall.opencilk.${host_arch}: euler3d-forall.cpp 
 	@echo $@
-	@echo $(KIT_CXX) $(TAPIR_OPENCILK_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(TIME_CMD) $(KIT_CXX) $(TAPIR_OPENCILK_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(FILE_SIZE)
 euler3d-forall.cuda.${host_arch}: euler3d-forall.cpp 
 	@echo $@
-	@echo $(KIT_CXX) $(TAPIR_CUDA_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(TIME_CMD) $(KIT_CXX) $(TAPIR_CUDA_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(FILE_SIZE)
 euler3d-forall.hip.${host_arch}: euler3d-forall.cpp 
 	@echo $@
-	@echo $(KIT_CXX) $(TAPIR_HIP_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(TIME_CMD) $(KIT_CXX) $(TAPIR_HIP_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(FILE_SIZE)
 
 # kokkos-based tests (w/out views)
 euler3d-kokkos.kitsune.cuda.${host_arch}: euler3d-kokkos-no-view.cpp 
 	@echo $@
-	@echo $(KIT_CXX) $(TAPIR_CUDA_FLAGS) $(KITSUNE_KOKKOS_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(TIME_CMD) $(KIT_CXX) $(TAPIR_CUDA_FLAGS) $(KITSUNE_KOKKOS_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(FILE_SIZE)
 euler3d-kokkos.kitsune.hip.${host_arch}: euler3d-kokkos-no-view.cpp 
 	@echo $@
-	@echo $(KIT_CXX) $(TAPIR_HIP_FLAGS) $(KITSUNE_KOKKOS_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(TIME_CMD) $(KIT_CXX) $(TAPIR_HIP_FLAGS) $(KITSUNE_KOKKOS_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(FILE_SIZE)
 
 # kokkos-based tests (using views -- currently not compatible w/ kitsune)
 euler3d-kokkos.nvcc.${host_arch}: euler3d-kokkos.cpp 
 	@echo $@
-	$(TIME_CMD) $(KOKKOS_NVCC) $(KOKKOS_NVCC_FLAGS) -o $@ $< $(KOKKOS_CUDA_LIBS) -Xlinker -rpath=$(KOKKOS_CUDA_PREFIX)/lib64 
+	@$(TIME_CMD) $(KOKKOS_NVCC) $(KOKKOS_NVCC_FLAGS) -o $@ $< $(KOKKOS_CUDA_LIBS) -Xlinker -rpath=$(KOKKOS_CUDA_PREFIX)/lib64 
 	@$(FILE_SIZE)
 euler3d-kokkos.hipcc.${host_arch}: euler3d-kokkos.cpp 
 	@echo $@
@@ -58,7 +50,6 @@ euler3d-kokkos.hipcc.${host_arch}: euler3d-kokkos.cpp
 	@$(FILE_SIZE)
 euler3d-kokkos.hip.${host_arch}: euler3d-kokkos.cpp 
 	@echo $@
-	@echo $(KITSUNE_HIPCC) $(KOKKOS_HIP_FLAGS) -I$(ROCM_PATH)/include -o $@ $< -L$(ROCM_PATH)/lib $(KOKKOS_HIP_LIBS) -lamdhip64 -Xlinker -rpath=$(KOKKOS_HIP_PREFIX)/lib64 
 	@$(TIME_CMD) $(KITSUNE_HIPCC) $(KOKKOS_HIP_FLAGS) -I$(ROCM_PATH)/include -o $@ $< -L$(ROCM_PATH)/lib $(KOKKOS_HIP_LIBS) -lamdhip64 -Xlinker -rpath=$(KOKKOS_HIP_PREFIX)/lib64 
 	@$(FILE_SIZE)
 

diff --git a/kitsune/experiments/inc/cuda.mk b/kitsune/experiments/inc/cuda.mk
@@ -17,3 +17,4 @@ ifneq ($(CUDA_PATH),)
 else 
   BUILD_CUDA_EXPERIMENTS=false
 endif 
+
diff --git a/kitsune/experiments/inc/kitsune-tapir.mk b/kitsune/experiments/inc/kitsune-tapir.mk
@@ -24,10 +24,10 @@ TAPIR_CUDA_FLAGS=$(TAPIR_CUDA_TARGET) $(TAPIR_CUDA_TARGET_FLAGS)
  #-mllvm -cuabi-run-post-opts \
  # -mllvm -cuabi-streams=true \
 
-TAPIR_CUDA_LTO_FLAGS?=-Wl,--tapir-target=cuda\
-		      -Wl,--threads=1\
-  		      -Wl,--lto-O${KITSUNE_OPTLEVEL}\
-		      -Wl,-mllvm=-cuabi-opt-level=$(KITSUNE_ABI_OPTLEVEL)\
+TAPIR_CUDA_LTO_FLAGS?=-Wl,--tapir-target=cuda \
+		      -Wl,--threads=1 \
+		      -Wl,--lto-O${KITSUNE_OPTLEVEL} \
+		      -Wl,-mllvm=-cuabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \
 		      -Wl,-mllvm=-cuabi-arch=$(CUDA_ARCH)
 
 ifneq ($(KITSUNE_VERBOSE),)
@@ -59,7 +59,12 @@ endif
 ##################################
 TAPIR_OPENCILK_TARGET=-ftapir=opencilk
 TAPIR_OPENCILK_FLAGS?=-ftapir=opencilk -O$(KITSUNE_OPTLEVEL)
-TAPIR_OPENCILK_LTO_FLAGS?=-Wl,--tapir-target=opencilk,--lto-O${KITSUNE_OPTLEVEL}
+TAPIR_OPENCILK_BC_PATH=${KITSUNE_PREFIX}/lib/clang/16/lib/${host_arch}-unknown-linux-gnu
+TAPIR_OPENCILK_BC_FILE=libopencilk-abi.bc
+TAPIR_OPENCILK_LTO_FLAGS?=-ftapir=opencilk -Wl,--lto-O${KITSUNE_OPTLEVEL}
+#TAPIR_OPENCILK_LTO_FLAGS?=-Wl,--tapir-target=opencilk,--lto-O${KITSUNE_OPTLEVEL} \
+#                          -Wl,--opencilk-abi-bitcode=${TAPIR_OPENCILK_BC_PATH}/${TAPIR_OPENCILK_BC_FILE}
+
 ##################################
 
 ##################################

diff --git a/kitsune/experiments/inc/kokkos.mk b/kitsune/experiments/inc/kokkos.mk
@@ -3,10 +3,10 @@ ifeq ($(BUILD_CUDA_EXPERIMENTS),true)
   KOKKOS_CUDA_PREFIX?=$(KITSUNE_PREFIX)/opt/kokkos/cuda
   KOKKOS_CUDA_LIBS=-L$(KOKKOS_CUDA_PREFIX)/lib -L$(KOKKOS_CUDA_PREFIX)/lib64 -L$(CUDA_PATH)/lib64 -lkokkoscore -lcudart -ldl
   KOKKOS_NVCC=$(KOKKOS_CUDA_PREFIX)/bin/nvcc_wrapper
-  KOKKOS_NVCC_FLAGS?= $(NVCC_CXX_FLAGS) -std=c++17 -I$(KOKKOS_CUDA_PREFIX)/include/ 
+  KOKKOS_NVCC_FLAGS?= $(NVCC_CXX_FLAGS) -std=c++17 -I$(KOKKOS_CUDA_PREFIX)/include/ -I$(KOKKOS_CUDA_PREFIX)/include/kokkos
   KOKKOS_CLANG=$(KITSUNE_PREFIX)/bin/clang
   KOKKOS_CLANG_CUDA_PREFIX?=$(KITSUNE_PREFIX)/opt/kokkos/clang-cuda
-  KOKKOS_CLANG_CUDA_FLAGS?= $(CLANG_CUDA_FLAGS) -fPIC -std=c++17 -fno-exceptions -I$(KOKKOS_CUDA_PREFIX)/include/
+  KOKKOS_CLANG_CUDA_FLAGS?= $(CLANG_CUDA_FLAGS) -fPIC -std=c++17 -fno-exceptions -I$(KOKKOS_CUDA_PREFIX)/include/ -I$(KOKKOS_CUDA_PREFIX)/include/kokkos
   KOKKOS_CLANG_CUDA_LIBS=-L$(KOKKOS_CLANG_CUDA_PREFIX)/lib64 -L$(CUDA_PATH)/lib64 -lkokkoscore -lcudart -ldl -lstdc++
 endif
 

diff --git a/kitsune/experiments/multi-target/common.cpp → kitsune/experiments/intersect/common.cpp b/kitsune/experiments/multi-target/common.cpp → kitsune/experiments/intersect/common.cpp
@@ -14,7 +14,7 @@
 
 #include "common.h"
 #include <kitsune.h>
-#include "kitrt/cuda.h"
+// #include "kitrt/cuda.h"
 
 using namespace std;
 
@@ -285,7 +285,7 @@ void create_cell_nodes_gpu(size_t *&nodes, int nx, int ny,
                        std::vector<size_t> *shuffle_nodes = nullptr,
                        std::vector<size_t> *shuffle_cells = nullptr) {
 
-  nodes = allocate<size_t>(KITSUNE, 4 * nx * nx, "source cell nodes");
+  nodes = allocate<size_t>(KITSUNE, 4 * nx * ny, "source cell nodes");
 
   // create cell to node mapping, just iterating the stencil explicitly
   if constexpr (LOG_LEVEL > 0)

diff --git a/kitsune/experiments/intersect/common.h b/kitsune/experiments/intersect/common.h
@@ -11,15 +11,15 @@
 #pragma once
 
 #include <cstdio>
-#include <nvToolsExt.h>
 #include <string>
 #include <vector>
-
-#ifdef _KITSUNE_
 #include <kitsune.h>
-#include "kitsune/kitrt/llvm-gpu.h"
-#include "kitsune/kitrt/kitrt-cuda.h"
-#endif
+// #include "kitrt/cuda.h"
+#include "nvToolsExt.h"
+
+// #include <kitsune.h>
+// #include "kitsune/kitrt/llvm-gpu.h"
+// #include "kitsune/kitrt/kitrt-cuda.h"
 
 const int LOG_LEVEL = 0;
 
@@ -31,9 +31,7 @@ enum PrefetchKinds {
 
 enum MemoryType {
   HOST = 0,    // Allocate on the host
-  KITSUNE = 1, // Let Kitsune do the memory management
-  CUDA = 2,    // Allocate on the device
-  MANAGED = 3  // Use UVM
+  KITSUNE = 1 // Let Kitsune do the memory management
 };
 
 // parse the command line arguments
@@ -67,17 +65,54 @@ void create_meshes(MemoryType memory_type, int nx, int ny, double x_max,
                    size_t *&source_node_offsets, double *&target_coordinates,
                    size_t *&target_cell_nodes, size_t *&target_node_offsets,
                    size_t *&candidates, size_t *&candidate_offsets,
-                   bool shuffle, size_t extra_bytes = 0);
+                   bool shuffle, std::vector<size_t> &shuffle_source_nodes, 
+                   std::vector<size_t> &shuffle_source_cells, std::vector<size_t> &shuffle_target_nodes, size_t extra_bytes = 0);
+
+
+
+
+// create the node coordinates of a rectangular structured mesh
+void create_coordinates_gpu(double *&coordinates, int nx,
+                        int ny, double x_max, double y_max, double shift_x,
+                        double shift_y, const char *label,
+                        std::vector<size_t> *shuffle_nodes, size_t extra_bytes);
+
+// create the cell to node topology
+void create_cell_nodes_gpu(size_t *&cell_nodes, int nx,
+                       int ny, const char *label,
+                       std::vector<size_t> *shuffle_nodes,
+                       std::vector<size_t> *shuffle_cells);
+
+// return sizes so that the candidates can be allocated
+size_t create_candidate_offsets(size_t *&offsets, int nx, int ny);
+
+// two pass algorithm so compact sparse data representation
+void create_candidates_gpu(size_t *&candidates, size_t n_candidates, int nx, int ny,
+                       size_t *offsets, const char *label,
+                       std::vector<size_t> *shuffle_cells);
+
+// create source and target mesh as well as intersection candidates
+void create_meshes_gpu(int nx, int ny, double x_max,
+                   double y_max, double shift_x, double shift_y,
+                   double *&source_coordinates, size_t *source_cell_nodes,
+                   size_t *source_node_offsets, double *&target_coordinates,
+                   size_t *target_cell_nodes, size_t *target_node_offsets,
+                   /*size_t *&candidates,*/ size_t *candidate_offsets,
+                   bool shuffle, std::vector<size_t> &shuffle_source_nodes, 
+                   std::vector<size_t> &shuffle_source_cells, std::vector<size_t> &shuffle_target_nodes);
+
+
+
 
 // check results
 template <class T> int check_equal(T *v1, T *v2, size_t n) {
   size_t n_unequal = 0;
   for (size_t i = 0; i < n; ++i) {
     if (v1[i] != v2[i]) {
       if constexpr (LOG_LEVEL > 0) {
-        if constexpr (std::is_same<T, double *>::value)
+        if constexpr (std::is_same<T, double >::value)
           printf("Vectors not equal at %lu: %f %f\n", i, v1[i], v2[i]);
-        else if constexpr (std::is_same<T, size_t *>::value)
+        else if constexpr (std::is_same<T, size_t >::value)
           printf("Vectors not equal at %lu: %lu %lu\n", i, v1[i], v2[i]);
       }
       ++n_unequal;
@@ -101,35 +136,12 @@ T *allocate(MemoryType memory_type, size_t n, const char *label) {
     message = std::string("malloc ") + label;
     nvtxMark(message.c_str());
     return (T *)malloc(sizeof(T) * n);
-#ifdef _KITSUNE_
   case KITSUNE:
     message = std::string("__kitrt_cuMemAllocManaged ") + label;
     nvtxMark(message.c_str());
-    return (T *)__kitrt_cuMemAllocManaged(sizeof(T) * n);
-  case CUDA:
-  case MANAGED:
-    return nullptr;
-#else
-  case KITSUNE:
-    return nullptr;
-  case CUDA:
-    message = std::string("cudaMalloc ") + label;
-    nvtxMark(message.c_str());
-    if (cudaMalloc(&device_buffer, sizeof(T) * n) != cudaSuccess) {
-      fprintf(stderr, "failed to allocate memory for array %s!\n", label);
-      exit(1);
-    }
-    return device_buffer;
-  case MANAGED:
-    message = std::string("cudaMallocManaged ") + label;
-    nvtxMark(message.c_str());
-    if (cudaMallocManaged(&device_buffer, sizeof(T) * n) != cudaSuccess) {
-      fprintf(stderr, "failed to allocate managed memory for array %s!\n",
-              label);
-      exit(1);
-    }
-    return device_buffer;
-#endif
+    // fprintf(stderr, "Allocating %s\n", message.c_str());
+    return (T *) alloc<T>(n);
+    // return (T *)__kitrt_cuMemAllocManaged(sizeof(T) * n);
   }
   return nullptr;
 }
@@ -151,24 +163,6 @@ void fill(MemoryType memory_type, T *device_buffer, T *host_buffer,
     nvtxMark(message.c_str());
     memcpy(device_buffer, host_buffer, sizeof(T) * n_copy);
     break;
-#ifdef _KITSUNE_
-  case CUDA:
-  case MANAGED:
-    break;
-#else
-  case CUDA:
-    message = std::string("cudaMemcpy ") + label + " to device";
-    nvtxMark(message.c_str());
-    cudaMemcpy(device_buffer, host_buffer, sizeof(T) * n_copy,
-               cudaMemcpyHostToDevice);
-    break;
-  case MANAGED:
-    message = std::string("memcopy ") + label + " to UVM";
-    nvtxMark(message.c_str());
-    memcpy(device_buffer, host_buffer, sizeof(T) * n_copy);
-    if (PFKind == EXPLICIT)
-      cudaMemPrefetchAsync(device_buffer, sizeof(T) * n_copy, 0);
-#endif
   }
 }