improve way to detect whether visible GPUs are present in the python API

gschramm · Apr 16, 2023 · 7c55aa7 · 7c55aa7
1 parent 2ebcad0
commit 7c55aa7
Show file tree

Hide file tree

Showing 8 changed files with 437 additions and 459 deletions.
diff --git a/cuda/include/parallelproj_cuda.h b/cuda/include/parallelproj_cuda.h
diff --git a/cuda/src/utils_cuda.cu b/cuda/src/utils_cuda.cu
@@ -2,60 +2,60 @@
  * @file utils_cuda.cu
  */
 
-#include<stdio.h>
-#include<omp.h>
+#include <stdio.h>
+#include <omp.h>
 
-extern "C" __global__ void add_to_first_kernel(float* a, float* b, unsigned long long n)
+extern "C" __global__ void add_to_first_kernel(float *a, float *b, unsigned long long n)
 {
-// add a vector b onto a vector a both of length n
+  // add a vector b onto a vector a both of length n
 
   unsigned long long i = blockDim.x * blockIdx.x + threadIdx.x;
 
-  if(i < n)
+  if (i < n)
   {
     a[i] += b[i];
   }
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
-extern "C" __global__ void print_int_device_array(int* a)
+extern "C" __global__ void print_int_device_array(int *a)
 {
   unsigned long long i = blockDim.x * blockIdx.x + threadIdx.x;
   printf("%lld %d\n", i, a[i]);
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
-extern "C" __global__ void print_float_device_array(float* a)
+extern "C" __global__ void print_float_device_array(float *a)
 {
   unsigned long long i = blockDim.x * blockIdx.x + threadIdx.x;
   printf("%lld %f\n", i, a[i]);
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
-extern "C" float** copy_float_array_to_all_devices(const float *h_array, long long n)
+extern "C" float **copy_float_array_to_all_devices(const float *h_array, long long n)
 {
-  cudaError_t error;  
+  cudaError_t error;
 
   // get number of visible devices
   int num_devices;
-  cudaGetDeviceCount(&num_devices);  
+  cudaGetDeviceCount(&num_devices);
 
   // create pointer to device arrays
-  float **d_array = new float * [num_devices];
+  float **d_array = new float *[num_devices];
 
-  long long array_bytes = n*sizeof(float);
+  long long array_bytes = n * sizeof(float);
 
-  # pragma omp parallel for schedule(static)
-  for (int i_dev = 0; i_dev < num_devices; i_dev++) 
+#pragma omp parallel for schedule(static)
+  for (int i_dev = 0; i_dev < num_devices; i_dev++)
   {
     cudaSetDevice(i_dev);
 
     error = cudaMalloc(&d_array[i_dev], array_bytes);
     if (error != cudaSuccess)
     {
-        printf("cudaMalloc returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), 
-               error, __LINE__);
-        exit(EXIT_FAILURE);
+      printf("cudaMalloc returned error %s (code %d), line(%d)\n", cudaGetErrorString(error),
+             error, __LINE__);
+      exit(EXIT_FAILURE);
     }
     cudaMemcpyAsync(d_array[i_dev], h_array, array_bytes, cudaMemcpyHostToDevice);
   }
@@ -74,8 +74,8 @@ extern "C" void free_float_array_on_all_devices(float **d_array)
   int num_devices;
   cudaGetDeviceCount(&num_devices);
 
-  # pragma omp parallel for schedule(static)
-  for (int i_dev = 0; i_dev < num_devices; i_dev++) 
+#pragma omp parallel for schedule(static)
+  for (int i_dev = 0; i_dev < num_devices; i_dev++)
   {
     cudaFree(d_array[i_dev]);
   }
@@ -88,7 +88,7 @@ extern "C" void free_float_array_on_all_devices(float **d_array)
 extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
 {
 
-  cudaError_t error;  
+  cudaError_t error;
   int threadsperblock = 32;
   dim3 block(threadsperblock);
   int blockspergrid = (int)ceil((float)n / threadsperblock);
@@ -99,21 +99,21 @@ extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
 
   float *d_array2;
 
-  long long array_bytes = n*sizeof(float);
+  long long array_bytes = n * sizeof(float);
 
-  if(num_devices > 1)
+  if (num_devices > 1)
   {
     cudaSetDevice(0);
 
-    for (int i_dev = 0; i_dev < num_devices; i_dev++) 
+    for (int i_dev = 0; i_dev < num_devices; i_dev++)
     {
-      if(i_dev == 0)
+      if (i_dev == 0)
       {
         // allocate memory for aux array to sum arrays on device 0
         error = cudaMalloc(&d_array2, array_bytes);
         if (error != cudaSuccess)
         {
-          printf("cudaMalloc returned error %s (code %d), line(%d)\n", 
+          printf("cudaMalloc returned error %s (code %d), line(%d)\n",
                  cudaGetErrorString(error), error, __LINE__);
           exit(EXIT_FAILURE);
         }
@@ -125,12 +125,12 @@ extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
         cudaMemcpyPeer(d_array2, 0, d_array[i_dev], i_dev, array_bytes);
 
         // call summation kernel to add d_array2 to d_array on device 0
-        add_to_first_kernel<<<grid,block>>>(d_array[0], d_array2, n);
+        add_to_first_kernel<<<grid, block>>>(d_array[0], d_array2, n);
       }
 
       cudaDeviceSynchronize();
     }
-    
+
     cudaFree(d_array2);
   }
 }
@@ -139,5 +139,17 @@ extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
 extern "C" void get_float_array_from_device(float **d_array, long long n, int i_dev, float *h_array)
 {
   cudaSetDevice(i_dev);
-  cudaMemcpy(h_array, d_array[i_dev], n*sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_array, d_array[i_dev], n * sizeof(float), cudaMemcpyDeviceToHost);
 }
+
+////////////////////////////////////////////////////////////////////////////////////////////
+extern "C" int get_cuda_device_count()
+{
+  int num_devices = 0;
+  cudaError_t err = cudaGetDeviceCount(&num_devices);
+
+  if (err != cudaSuccess)
+    num_devices = 0;
+
+  return num_devices;
+}
diff --git a/python/parallelproj/__init__.py b/python/parallelproj/__init__.py
@@ -1,4 +1,5 @@
-from .config import cuda_enabled, cupy_enabled, get_array_module, XPArray, XPFloat32Array, XPShortArray
+from .backend import cuda_present, cupy_enabled, get_array_module, XPArray, XPFloat32Array, XPShortArray
+from .backend import num_visible_cuda_devices
 from .backend import joseph3d_fwd, joseph3d_back
 from .backend import joseph3d_fwd_tof_sino, joseph3d_back_tof_sino
 from .backend import joseph3d_fwd_tof_lm, joseph3d_back_tof_lm
diff --git a/python/parallelproj/backend.py b/python/parallelproj/backend.py
@@ -1,4 +1,6 @@
 import os
+import importlib
+import distutils
 import math
 
 import ctypes
@@ -10,8 +12,28 @@
 
 import numpy as np
 import numpy.ctypeslib as npct
+import numpy.typing as npt
 
-from parallelproj.config import cuda_enabled, cupy_enabled, XPShortArray, XPFloat32Array, get_array_module
+from typing import Union
+from types import ModuleType
+
+# check if cuda is present
+cuda_present = distutils.spawn.find_executable('nvidia-smi') is not None
+
+# check if cupy is available
+cupy_enabled = (importlib.util.find_spec('cupy') is not None)
+
+# define type for cupy or numpy array
+if cupy_enabled:
+    import cupy as cp
+    import cupy.typing as cpt
+    XPArray = Union[npt.NDArray, cpt.NDArray]
+    XPFloat32Array = Union[npt.NDArray[np.float32], cpt.NDArray[np.float32]]
+    XPShortArray = Union[npt.NDArray[np.int16], cpt.NDArray[np.int16]]
+else:
+    XPArray = npt.NDArray
+    XPFloat32Array = npt.NDArray[np.float32]
+    XPShortArray = npt.NDArray[np.int16]
 
 # numpy ctypes lib array definitions
 ar_1d_single = npct.ndpointer(dtype=ctypes.c_float, ndim=1, flags='C')
@@ -137,7 +159,7 @@
 
 #---------------------------------------------------------------------------------------
 
-if cuda_enabled:
+if cuda_present:
     if 'PARALLELPROJ_CUDA_LIB' in os.environ:
         lib_parallelproj_cuda_fname = os.environ['PARALLELPROJ_CUDA_LIB']
     else:
@@ -152,6 +174,14 @@
             os.path.basename(lib_parallelproj_cuda_fname),
             os.path.dirname(lib_parallelproj_cuda_fname))
 
+        # get the number of visible cuda devices
+        lib_parallelproj_cuda.get_cuda_device_count.restype = np.int32
+        num_visible_cuda_devices = lib_parallelproj_cuda.get_cuda_device_count(
+        )
+
+        if (num_visible_cuda_devices == 0) and cupy_enabled:
+            cupy_enabled = False
+
         lib_parallelproj_cuda.joseph3d_fwd_cuda.restype = None
         lib_parallelproj_cuda.joseph3d_fwd_cuda.argtypes = [
             ar_1d_single,  # h_xstart
@@ -387,7 +417,7 @@ def joseph3d_fwd(xstart: XPFloat32Array,
              xp.asarray(img_dim)))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -461,7 +491,7 @@ def joseph3d_back(xstart: XPFloat32Array,
              np.int64(nLORs), xp.asarray(img_dim)))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # back projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -569,7 +599,7 @@ def joseph3d_fwd_tof_sino(xstart: XPFloat32Array,
              lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # back projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -689,7 +719,7 @@ def joseph3d_back_tof_sino(xstart: XPFloat32Array,
              lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # back projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -815,7 +845,7 @@ def joseph3d_fwd_tof_lm(xstart: XPFloat32Array,
              lor_dependent_tofcenter_offset))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -933,7 +963,7 @@ def joseph3d_back_tof_lm(xstart: XPFloat32Array,
              lor_dependent_tofcenter_offset))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # back projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -982,7 +1012,28 @@ def joseph3d_back_tof_lm(xstart: XPFloat32Array,
         else:
             # back projection of numpy array using the openmp parallelproj lib
             lib_parallelproj_c.joseph3d_back_tof_lm(
-                xstart.ravel(), xend.ravel(), back_img.ravel(), img_origin,
-                voxsize, lst, np.int64(nLORs), img_dim, tofbin_width, sigma_tof,
+                xstart.ravel(),
+                xend.ravel(), back_img.ravel(), img_origin, voxsize, lst,
+                np.int64(nLORs), img_dim, tofbin_width, sigma_tof,
                 tofcenter_offset, nsigmas, tofbin, lor_dependent_sigma_tof,
-                lor_dependent_tofcenter_offset)
+                lor_dependent_tofcenter_offset)
+
+
+#-----------------------------------------------------------------------------
+
+
+def get_array_module(array) -> ModuleType:
+    """return module of a cupy or numpy array
+
+    Parameters
+    ----------
+    array : cupy or numpy array
+
+    Returns
+    -------
+    cupy or numpy module
+    """
+    if cupy_enabled:
+        return cp.get_array_module(array)
+    else:
+        return np
diff --git a/python/parallelproj/config.py b/python/parallelproj/config.py