Skip to content

Commit

Permalink
improve way to detect whether visible GPUs are present in the python API
Browse files Browse the repository at this point in the history
  • Loading branch information
gschramm committed Apr 16, 2023
1 parent 2ebcad0 commit 7c55aa7
Show file tree
Hide file tree
Showing 8 changed files with 437 additions and 459 deletions.
667 changes: 327 additions & 340 deletions cuda/include/parallelproj_cuda.h

Large diffs are not rendered by default.

68 changes: 40 additions & 28 deletions cuda/src/utils_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,60 +2,60 @@
* @file utils_cuda.cu
*/

#include<stdio.h>
#include<omp.h>
#include <stdio.h>
#include <omp.h>

extern "C" __global__ void add_to_first_kernel(float* a, float* b, unsigned long long n)
extern "C" __global__ void add_to_first_kernel(float *a, float *b, unsigned long long n)
{
// add a vector b onto a vector a both of length n
// add a vector b onto a vector a both of length n

unsigned long long i = blockDim.x * blockIdx.x + threadIdx.x;

if(i < n)
if (i < n)
{
a[i] += b[i];
}
}

//////////////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void print_int_device_array(int* a)
extern "C" __global__ void print_int_device_array(int *a)
{
unsigned long long i = blockDim.x * blockIdx.x + threadIdx.x;
printf("%lld %d\n", i, a[i]);
}

//////////////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void print_float_device_array(float* a)
extern "C" __global__ void print_float_device_array(float *a)
{
unsigned long long i = blockDim.x * blockIdx.x + threadIdx.x;
printf("%lld %f\n", i, a[i]);
}

//////////////////////////////////////////////////////////////////////////////////////////
extern "C" float** copy_float_array_to_all_devices(const float *h_array, long long n)
extern "C" float **copy_float_array_to_all_devices(const float *h_array, long long n)
{
cudaError_t error;
cudaError_t error;

// get number of visible devices
int num_devices;
cudaGetDeviceCount(&num_devices);
cudaGetDeviceCount(&num_devices);

// create pointer to device arrays
float **d_array = new float * [num_devices];
float **d_array = new float *[num_devices];

long long array_bytes = n*sizeof(float);
long long array_bytes = n * sizeof(float);

# pragma omp parallel for schedule(static)
for (int i_dev = 0; i_dev < num_devices; i_dev++)
#pragma omp parallel for schedule(static)
for (int i_dev = 0; i_dev < num_devices; i_dev++)
{
cudaSetDevice(i_dev);

error = cudaMalloc(&d_array[i_dev], array_bytes);
if (error != cudaSuccess)
{
printf("cudaMalloc returned error %s (code %d), line(%d)\n", cudaGetErrorString(error),
error, __LINE__);
exit(EXIT_FAILURE);
printf("cudaMalloc returned error %s (code %d), line(%d)\n", cudaGetErrorString(error),
error, __LINE__);
exit(EXIT_FAILURE);
}
cudaMemcpyAsync(d_array[i_dev], h_array, array_bytes, cudaMemcpyHostToDevice);
}
Expand All @@ -74,8 +74,8 @@ extern "C" void free_float_array_on_all_devices(float **d_array)
int num_devices;
cudaGetDeviceCount(&num_devices);

# pragma omp parallel for schedule(static)
for (int i_dev = 0; i_dev < num_devices; i_dev++)
#pragma omp parallel for schedule(static)
for (int i_dev = 0; i_dev < num_devices; i_dev++)
{
cudaFree(d_array[i_dev]);
}
Expand All @@ -88,7 +88,7 @@ extern "C" void free_float_array_on_all_devices(float **d_array)
extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
{

cudaError_t error;
cudaError_t error;
int threadsperblock = 32;
dim3 block(threadsperblock);
int blockspergrid = (int)ceil((float)n / threadsperblock);
Expand All @@ -99,21 +99,21 @@ extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)

float *d_array2;

long long array_bytes = n*sizeof(float);
long long array_bytes = n * sizeof(float);

if(num_devices > 1)
if (num_devices > 1)
{
cudaSetDevice(0);

for (int i_dev = 0; i_dev < num_devices; i_dev++)
for (int i_dev = 0; i_dev < num_devices; i_dev++)
{
if(i_dev == 0)
if (i_dev == 0)
{
// allocate memory for aux array to sum arrays on device 0
error = cudaMalloc(&d_array2, array_bytes);
if (error != cudaSuccess)
{
printf("cudaMalloc returned error %s (code %d), line(%d)\n",
printf("cudaMalloc returned error %s (code %d), line(%d)\n",
cudaGetErrorString(error), error, __LINE__);
exit(EXIT_FAILURE);
}
Expand All @@ -125,12 +125,12 @@ extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
cudaMemcpyPeer(d_array2, 0, d_array[i_dev], i_dev, array_bytes);

// call summation kernel to add d_array2 to d_array on device 0
add_to_first_kernel<<<grid,block>>>(d_array[0], d_array2, n);
add_to_first_kernel<<<grid, block>>>(d_array[0], d_array2, n);
}

cudaDeviceSynchronize();
}

cudaFree(d_array2);
}
}
Expand All @@ -139,5 +139,17 @@ extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
extern "C" void get_float_array_from_device(float **d_array, long long n, int i_dev, float *h_array)
{
cudaSetDevice(i_dev);
cudaMemcpy(h_array, d_array[i_dev], n*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_array, d_array[i_dev], n * sizeof(float), cudaMemcpyDeviceToHost);
}

////////////////////////////////////////////////////////////////////////////////////////////
extern "C" int get_cuda_device_count()
{
int num_devices = 0;
cudaError_t err = cudaGetDeviceCount(&num_devices);

if (err != cudaSuccess)
num_devices = 0;

return num_devices;
}
3 changes: 2 additions & 1 deletion python/parallelproj/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .config import cuda_enabled, cupy_enabled, get_array_module, XPArray, XPFloat32Array, XPShortArray
from .backend import cuda_present, cupy_enabled, get_array_module, XPArray, XPFloat32Array, XPShortArray
from .backend import num_visible_cuda_devices
from .backend import joseph3d_fwd, joseph3d_back
from .backend import joseph3d_fwd_tof_sino, joseph3d_back_tof_sino
from .backend import joseph3d_fwd_tof_lm, joseph3d_back_tof_lm
73 changes: 62 additions & 11 deletions python/parallelproj/backend.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
import importlib
import distutils
import math

import ctypes
Expand All @@ -10,8 +12,28 @@

import numpy as np
import numpy.ctypeslib as npct
import numpy.typing as npt

from parallelproj.config import cuda_enabled, cupy_enabled, XPShortArray, XPFloat32Array, get_array_module
from typing import Union
from types import ModuleType

# check if cuda is present
cuda_present = distutils.spawn.find_executable('nvidia-smi') is not None

# check if cupy is available
cupy_enabled = (importlib.util.find_spec('cupy') is not None)

# define type for cupy or numpy array
if cupy_enabled:
import cupy as cp
import cupy.typing as cpt
XPArray = Union[npt.NDArray, cpt.NDArray]
XPFloat32Array = Union[npt.NDArray[np.float32], cpt.NDArray[np.float32]]
XPShortArray = Union[npt.NDArray[np.int16], cpt.NDArray[np.int16]]
else:
XPArray = npt.NDArray
XPFloat32Array = npt.NDArray[np.float32]
XPShortArray = npt.NDArray[np.int16]

# numpy ctypes lib array definitions
ar_1d_single = npct.ndpointer(dtype=ctypes.c_float, ndim=1, flags='C')
Expand Down Expand Up @@ -137,7 +159,7 @@

#---------------------------------------------------------------------------------------

if cuda_enabled:
if cuda_present:
if 'PARALLELPROJ_CUDA_LIB' in os.environ:
lib_parallelproj_cuda_fname = os.environ['PARALLELPROJ_CUDA_LIB']
else:
Expand All @@ -152,6 +174,14 @@
os.path.basename(lib_parallelproj_cuda_fname),
os.path.dirname(lib_parallelproj_cuda_fname))

# get the number of visible cuda devices
lib_parallelproj_cuda.get_cuda_device_count.restype = np.int32
num_visible_cuda_devices = lib_parallelproj_cuda.get_cuda_device_count(
)

if (num_visible_cuda_devices == 0) and cupy_enabled:
cupy_enabled = False

lib_parallelproj_cuda.joseph3d_fwd_cuda.restype = None
lib_parallelproj_cuda.joseph3d_fwd_cuda.argtypes = [
ar_1d_single, # h_xstart
Expand Down Expand Up @@ -387,7 +417,7 @@ def joseph3d_fwd(xstart: XPFloat32Array,
xp.asarray(img_dim)))
xp.cuda.Device().synchronize()
else:
if cuda_enabled:
if num_visible_cuda_devices > 0:
# projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])

Expand Down Expand Up @@ -461,7 +491,7 @@ def joseph3d_back(xstart: XPFloat32Array,
np.int64(nLORs), xp.asarray(img_dim)))
xp.cuda.Device().synchronize()
else:
if cuda_enabled:
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])

Expand Down Expand Up @@ -569,7 +599,7 @@ def joseph3d_fwd_tof_sino(xstart: XPFloat32Array,
lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
xp.cuda.Device().synchronize()
else:
if cuda_enabled:
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])

Expand Down Expand Up @@ -689,7 +719,7 @@ def joseph3d_back_tof_sino(xstart: XPFloat32Array,
lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
xp.cuda.Device().synchronize()
else:
if cuda_enabled:
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])

Expand Down Expand Up @@ -815,7 +845,7 @@ def joseph3d_fwd_tof_lm(xstart: XPFloat32Array,
lor_dependent_tofcenter_offset))
xp.cuda.Device().synchronize()
else:
if cuda_enabled:
if num_visible_cuda_devices > 0:
# projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])

Expand Down Expand Up @@ -933,7 +963,7 @@ def joseph3d_back_tof_lm(xstart: XPFloat32Array,
lor_dependent_tofcenter_offset))
xp.cuda.Device().synchronize()
else:
if cuda_enabled:
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])

Expand Down Expand Up @@ -982,7 +1012,28 @@ def joseph3d_back_tof_lm(xstart: XPFloat32Array,
else:
# back projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_back_tof_lm(
xstart.ravel(), xend.ravel(), back_img.ravel(), img_origin,
voxsize, lst, np.int64(nLORs), img_dim, tofbin_width, sigma_tof,
xstart.ravel(),
xend.ravel(), back_img.ravel(), img_origin, voxsize, lst,
np.int64(nLORs), img_dim, tofbin_width, sigma_tof,
tofcenter_offset, nsigmas, tofbin, lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset)
lor_dependent_tofcenter_offset)


#-----------------------------------------------------------------------------


def get_array_module(array) -> ModuleType:
"""return module of a cupy or numpy array
Parameters
----------
array : cupy or numpy array
Returns
-------
cupy or numpy module
"""
if cupy_enabled:
return cp.get_array_module(array)
else:
return np
43 changes: 0 additions & 43 deletions python/parallelproj/config.py

This file was deleted.

Loading

0 comments on commit 7c55aa7

Please sign in to comment.