diff --git a/cuda/include/parallelproj_cuda.h b/cuda/include/parallelproj_cuda.h
index d1e2384b..5c6d2383 100644
--- a/cuda/include/parallelproj_cuda.h
+++ b/cuda/include/parallelproj_cuda.h
@@ -5,350 +5,337 @@
 #ifndef __PARALLELPROJ_CUDA_H__
 #define __PARALLELPROJ_CUDA_H__
 
-#ifdef  __cplusplus
-extern "C" {
+#ifdef __cplusplus
+extern "C"
+{
 #endif
 
-/** @brief 3D non-tof joseph back projector CUDA wrapper
- *
- *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to 
- *                  used to store the back projections.
- *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
- *                  The backprojector adds existing values.
- *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
- *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
- *  @param h_p           array of length nlors containg the values to be back projected
- *  @param nlors          number of projections (length of p array)
- *  @param h_img_dim      array with dimensions of image [n0,n1,n2]
- *  @param threadsperblock number of threads per block
- */
-void joseph3d_back_cuda(const float *h_xstart, 
-                        const float *h_xend, 
-                        float **d_img,
-                        const float *h_img_origin, 
-                        const float *h_voxsize, 
-                        const float *h_p,
-                        long long nlors, 
-                        const int *h_img_dim, 
-                        int threadsperblock);
-
-
-
-/** @brief 3D listmode tof joseph back projector CUDA wrapper
- *
- *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to 
- *                  used to store the back projections.
- *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
- *                  The backprojector adds existing values.
- *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
- *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
- *  @param h_p           array of length nlors containg the values to be back projected
- *  @param nlors         number of projections (length of p array)
- *  @param h_img_dim     array with dimensions of image [n0,n1,n2]
- *  @param tofbin_width     width of the TOF bins in spatial units (units of xstart and xend)
- *  @param h_sigma_tof      array of length 1 or nlors (depending on lor_dependent_sigma_tof)
- *                          with the TOF resolution (sigma) for each LOR in
- *                          spatial units (units of xstart and xend) 
- *  @param h_tofcenter_offset array of length 1 or nlors (depending on lor_dependent_tofcenter_offset)
- *                            with the offset of the central TOF bin from the 
- *                            midpoint of each LOR in spatial units (units of xstart and xend). 
- *                            A positive value means a shift towards the end point of the LOR.
- *  @param n_sigmas        number of sigmas to consider for calculation of TOF kernel
- *  @param h_tof_bin       array containing the TOF bin of each event
- *  @param lor_dependent_sigma_tof unsigned char 0 or 1
- *                                 1 means that the TOF sigmas are LOR dependent
- *                                 any other value means that the first value in the sigma_tof
- *                                 array is used for all LORs
- *  @param lor_dependent_tofcenter_offset unsigned char 0 or 1
- *                                        1 means that the TOF center offsets are LOR dependent
- *                                        any other value means that the first value in the 
- *                                        tofcenter_offset array is used for all LORs
- *  @param threadsperblock number of threads per block
- */
-void joseph3d_back_tof_lm_cuda(const float *h_xstart,
-                               const float *h_xend,
-                               float **d_img,
-                               const float *h_img_origin,
-                               const float *h_voxsize,
-                               const float *h_p,
-                               long long nlors,
-                               const int *h_img_dim,
-                               float tofbin_width,
-                               const float *h_sigma_tof,
-                               const float *h_tofcenter_offset,
-                               float n_sigmas,
-                               const short *h_tof_bin,
-                               unsigned char lor_dependent_sigma_tof,
-                               unsigned char lor_dependent_tofcenter_offset,
-                               int threadsperblock);
-
-
-
-/** @brief 3D sinogram tof joseph back projector CUDA wrapper
- *
- *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to 
- *                  used to store the back projections.
- *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
- *                  The backprojector adds existing values.
- *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
- *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
- *  @param h_p           array of length nlors*n_tofbins with the values to be back projected
- *                       the order of the array is 
- *                       [LOR0-TOFBIN-0, LOR0-TOFBIN-1, ... LOR0_TOFBIN-(n-1), 
- *                        LOR1-TOFBIN-0, LOR1-TOFBIN-1, ... LOR1_TOFBIN-(n-1), 
- *                        ...
- *                        LOR(N-1)-TOFBIN-0, LOR(N-1)-TOFBIN-1, ... LOR(N-1)_TOFBIN-(n-1)] 
- *  @param nlors          number of gemeometrical projections
- *  @param h_img_dim      array with dimensions of image [n0,n1,n2]
- *  @param tofbin_width     width of the TOF bins in spatial units (units of xstart and xend)
- *  @param h_sigma_tof      array of length 1 or nlors (depending on lor_dependent_sigma_tof)
- *                          with the TOF resolution (sigma) for each LOR in
- *                          spatial units (units of xstart and xend) 
- *  @param h_tofcenter_offset array of length 1 or nlors (depending on lor_dependent_tofcenter_offset)
- *                            with the offset of the central TOF bin from the 
- *                            midpoint of each LOR in spatial units (units of xstart and xend). 
- *                            A positive value means a shift towards the end point of the LOR.
- *  @param n_sigmas           number of sigmas to consider for calculation of TOF kernel
- *  @param n_tofbins          number of TOF bins
- *  @param lor_dependent_sigma_tof unsigned char 0 or 1
- *                                 1 means that the TOF sigmas are LOR dependent
- *                                 any other value means that the first value in the sigma_tof
- *                                 array is used for all LORs
- *  @param lor_dependent_tofcenter_offset unsigned char 0 or 1
- *                                        1 means that the TOF center offsets are LOR dependent
- *                                        any other value means that the first value in the 
- *                                        tofcenter_offset array is used for all LORs
- *  @param threadsperblock number of threads per block
- */
-void joseph3d_back_tof_sino_cuda(const float *h_xstart, 
-                                 const float *h_xend, 
-                                 float **d_img,
-                                 const float *h_img_origin, 
-                                 const float *h_voxsize, 
-                                 const float *h_p,
-                                 long long nlors, 
-                                 const int *h_img_dim, 
-                                 float tofbin_width,
-                                 const float *h_sigma_tof,
-                                 const float *h_tofcenter_offset,
-                                 float n_sigmas,
-                                 short n_tofbins,
-                                 unsigned char lor_dependent_sigma_tof,
-                                 unsigned char lor_dependent_tofcenter_offset,
-                                 int threadsperblock);
-
-
-
-/** @brief 3D non-tof joseph forward projector CUDA wrapper
- *
- *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to 
- *                  be projected.
- *                  The pixel [i,j,k] ist stored at [n1*n2*i + n2*j + k].
- *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
- *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
- *  @param h_p           array of length nlors (output) used to store the projections
- *  @param nlors           number of projections (length of p array)
- *  @param h_img_dim       array with dimensions of image [n0,n1,n2]
- *  @param threadsperblock number of threads per block
- */
-void joseph3d_fwd_cuda(const float *h_xstart, 
-                       const float *h_xend, 
-                       float **d_img,
-                       const float *h_img_origin, 
-                       const float *h_voxsize, 
-                       float *h_p,
-                       long long nlors, 
-                       const int *h_img_dim,
-                       int threadsperblock);
-
-
-
-/** @brief 3D listmode tof joseph forward projector CUDA wrapper
- *
- *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to 
- *                  used to store the back projections.
- *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
- *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
- *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
- *  @param h_p           array of length nlors (output) used to store the projections
- *  @param nlors         number of projections (length of p array)
- *  @param h_img_dim     array with dimensions of image [n0,n1,n2]
- *  @param tofbin_width  width of the TOF bins in spatial units (units of xstart and xend)
- *  @param h_sigma_tof   array of length 1 or nlors (depending on lor_dependent_sigma_tof)
- *                       with the TOF resolution (sigma) for each LOR in
- *                       spatial units (units of xstart and xend) 
- *  @param h_tofcenter_offset array of length 1 or nlors (depending on lor_dependent_tofcenter_offset)
- *                            with the offset of the central TOF bin from the 
- *                            midpoint of each LOR in spatial units (units of xstart and xend). 
- *                            A positive value means a shift towards the end point of the LOR.
- *  @param n_sigmas           number of sigmas to consider for calculation of TOF kernel
- *  @param h_tof_bin          array of length nlors with the tofbin of every event 
- *  @param lor_dependent_sigma_tof unsigned char 0 or 1
- *                                 1 means that the TOF sigmas are LOR dependent
- *                                 any other value means that the first value in the sigma_tof
- *                                 array is used for all LORs
- *  @param lor_dependent_tofcenter_offset unsigned char 0 or 1
- *                                        1 means that the TOF center offsets are LOR dependent
- *                                        any other value means that the first value in the 
- *                                        tofcenter_offset array is used for all LORs
- *  @param threadsperblock    number of threads per block
- */
-void joseph3d_fwd_tof_lm_cuda(const float *h_xstart, 
-                              const float *h_xend, 
-                              float **d_img,
-                              const float *h_img_origin, 
-                              const float *h_voxsize, 
-                              float *h_p,
-                              long long nlors, 
-                              const int *h_img_dim,
-                              float tofbin_width,
-                              const float *h_sigma_tof,
-                              const float *h_tofcenter_offset,
-                              float n_sigmas,
-                              const short *h_tof_bin,
-                              unsigned char lor_dependent_sigma_tof,
-                              unsigned char lor_dependent_tofcenter_offset,
-                              int threadsperblock);
-
-
-
-/** @brief 3D sinogram tof joseph forward projector CUDA wrapper
- *
- *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
- *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2. 
- *                  Units are the ones of voxsize.
- *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to 
- *                  used to store the back projections.
- *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
- *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
- *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
- *  @param h_p           array of length nlors*n_tofbins (output) used to store the projections
- *                       the order of the array is
- *                       [LOR0-TOFBIN-0, LOR0-TOFBIN-1, ... LOR0_TOFBIN-(n-1), 
- *                        LOR1-TOFBIN-0, LOR1-TOFBIN-1, ... LOR1_TOFBIN-(n-1), 
- *                        ...
- *                        LOR(N-1)-TOFBIN-0, LOR(N-1)-TOFBIN-1, ... LOR(N-1)_TOFBIN-(n-1)] 
- *  @param nlors         number of geometrical LORs
- *  @param h_img_dim     array with dimensions of image [n0,n1,n2]
- *  @param tofbin_width  width of the TOF bins in spatial units (units of xstart and xend)
- *  @param h_sigma_tof   array of length 1 or nlors (depending on lor_dependent_sigma_tof)
- *                       with the TOF resolution (sigma) for each LOR in
- *                       spatial units (units of xstart and xend) 
- *  @param h_tofcenter_offset array of length 1 or nlors (depending on lor_dependent_tofcenter_offset)
- *                            with the offset of the central TOF bin from the 
- *                            midpoint of each LOR in spatial units (units of xstart and xend). 
- *                            A positive value means a shift towards the end point of the LOR.
- *  @param n_sigmas           number of sigmas to consider for calculation of TOF kernel
- *  @param n_tofbins          number of TOF bins
- *  @param lor_dependent_sigma_tof unsigned char 0 or 1
- *                                 1 means that the TOF sigmas are LOR dependent
- *                                 any other value means that the first value in the sigma_tof
- *                                 array is used for all LORs
- *  @param lor_dependent_tofcenter_offset unsigned char 0 or 1
- *                                        1 means that the TOF center offsets are LOR dependent
- *                                        any other value means that the first value in the 
- *                                        tofcenter_offset array is used for all LORs
- *  @param threadsperblock    number of threads per block
- */
-void joseph3d_fwd_tof_sino_cuda(const float *h_xstart, 
-                                const float *h_xend, 
-                                float **d_img,
-                                const float *h_img_origin, 
-                                const float *h_voxsize, 
-                                float *h_p,
-                                long long nlors, 
-                                const int *h_img_dim,
-                                float tofbin_width,
-                                const float *h_sigma_tof,
-                                const float *h_tofcenter_offset,
-                                float n_sigmas,
-                                short n_tofbins,
-                                unsigned char lor_dependent_sigma_tof,
-                                unsigned char lor_dependent_tofcenter_offset,
-                                int threadsperblock);
-
-
-
-/** @brief copy a float array to all visible cuda devices
- *
- *  The number of visible cuda devices is determined automatically via the CUDA API
- *
- *  @param    h_array   array of shape [n] on the host
- *  @param    n         number of array elements
- *  @return   a pointer to all devices arrays
- */
-float** copy_float_array_to_all_devices(const float *h_array, long long n);
-
-
-
-/** @brief free device array on all visible cuda devices
- *
- *  The number of visible cuda devices is determined automatically via the CUDA API
- *
- *  @param d_array a pointer to all devices arrays
- */
-void free_float_array_on_all_devices(float **d_array);
-
-
-
-/** @brief sum multiple versions of an array on different devices on first device
- *
- *  The number of visible cuda devices is determined automatically via the CUDA API
- *  This becomes usefule when multiple devices backproject into separate images.
- *
- *  @param d_array a pointer to all devices arrays
- *  @param    n         number of array elements
- */
-void sum_float_arrays_on_first_device(float **d_array, long long n);
-
-
-
-/** @brief copy a (summed) float array from first device back to host
- *
- *  The number of visible cuda devices is determined automatically via the CUDA API
- *
- *  @param  d_array   a pointer to all devices arrays of shape [n]
- *  @param  n         number of array elements
- *  @param  i_dev     device number
- *  @param  h_array   array of shape [n] on the host used for output
- */
-void get_float_array_from_device(float **d_array, long long n, int i_dev, float *h_array);
+    /** @brief 3D non-tof joseph back projector CUDA wrapper
+     *
+     *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to
+     *                  used to store the back projections.
+     *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
+     *                  The backprojector adds existing values.
+     *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
+     *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
+     *  @param h_p           array of length nlors containg the values to be back projected
+     *  @param nlors          number of projections (length of p array)
+     *  @param h_img_dim      array with dimensions of image [n0,n1,n2]
+     *  @param threadsperblock number of threads per block
+     */
+    void joseph3d_back_cuda(const float *h_xstart,
+                            const float *h_xend,
+                            float **d_img,
+                            const float *h_img_origin,
+                            const float *h_voxsize,
+                            const float *h_p,
+                            long long nlors,
+                            const int *h_img_dim,
+                            int threadsperblock);
+
+    /** @brief 3D listmode tof joseph back projector CUDA wrapper
+     *
+     *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to
+     *                  used to store the back projections.
+     *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
+     *                  The backprojector adds existing values.
+     *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
+     *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
+     *  @param h_p           array of length nlors containg the values to be back projected
+     *  @param nlors         number of projections (length of p array)
+     *  @param h_img_dim     array with dimensions of image [n0,n1,n2]
+     *  @param tofbin_width     width of the TOF bins in spatial units (units of xstart and xend)
+     *  @param h_sigma_tof      array of length 1 or nlors (depending on lor_dependent_sigma_tof)
+     *                          with the TOF resolution (sigma) for each LOR in
+     *                          spatial units (units of xstart and xend)
+     *  @param h_tofcenter_offset array of length 1 or nlors (depending on lor_dependent_tofcenter_offset)
+     *                            with the offset of the central TOF bin from the
+     *                            midpoint of each LOR in spatial units (units of xstart and xend).
+     *                            A positive value means a shift towards the end point of the LOR.
+     *  @param n_sigmas        number of sigmas to consider for calculation of TOF kernel
+     *  @param h_tof_bin       array containing the TOF bin of each event
+     *  @param lor_dependent_sigma_tof unsigned char 0 or 1
+     *                                 1 means that the TOF sigmas are LOR dependent
+     *                                 any other value means that the first value in the sigma_tof
+     *                                 array is used for all LORs
+     *  @param lor_dependent_tofcenter_offset unsigned char 0 or 1
+     *                                        1 means that the TOF center offsets are LOR dependent
+     *                                        any other value means that the first value in the
+     *                                        tofcenter_offset array is used for all LORs
+     *  @param threadsperblock number of threads per block
+     */
+    void joseph3d_back_tof_lm_cuda(const float *h_xstart,
+                                   const float *h_xend,
+                                   float **d_img,
+                                   const float *h_img_origin,
+                                   const float *h_voxsize,
+                                   const float *h_p,
+                                   long long nlors,
+                                   const int *h_img_dim,
+                                   float tofbin_width,
+                                   const float *h_sigma_tof,
+                                   const float *h_tofcenter_offset,
+                                   float n_sigmas,
+                                   const short *h_tof_bin,
+                                   unsigned char lor_dependent_sigma_tof,
+                                   unsigned char lor_dependent_tofcenter_offset,
+                                   int threadsperblock);
+
+    /** @brief 3D sinogram tof joseph back projector CUDA wrapper
+     *
+     *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to
+     *                  used to store the back projections.
+     *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
+     *                  The backprojector adds existing values.
+     *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
+     *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
+     *  @param h_p           array of length nlors*n_tofbins with the values to be back projected
+     *                       the order of the array is
+     *                       [LOR0-TOFBIN-0, LOR0-TOFBIN-1, ... LOR0_TOFBIN-(n-1),
+     *                        LOR1-TOFBIN-0, LOR1-TOFBIN-1, ... LOR1_TOFBIN-(n-1),
+     *                        ...
+     *                        LOR(N-1)-TOFBIN-0, LOR(N-1)-TOFBIN-1, ... LOR(N-1)_TOFBIN-(n-1)]
+     *  @param nlors          number of gemeometrical projections
+     *  @param h_img_dim      array with dimensions of image [n0,n1,n2]
+     *  @param tofbin_width     width of the TOF bins in spatial units (units of xstart and xend)
+     *  @param h_sigma_tof      array of length 1 or nlors (depending on lor_dependent_sigma_tof)
+     *                          with the TOF resolution (sigma) for each LOR in
+     *                          spatial units (units of xstart and xend)
+     *  @param h_tofcenter_offset array of length 1 or nlors (depending on lor_dependent_tofcenter_offset)
+     *                            with the offset of the central TOF bin from the
+     *                            midpoint of each LOR in spatial units (units of xstart and xend).
+     *                            A positive value means a shift towards the end point of the LOR.
+     *  @param n_sigmas           number of sigmas to consider for calculation of TOF kernel
+     *  @param n_tofbins          number of TOF bins
+     *  @param lor_dependent_sigma_tof unsigned char 0 or 1
+     *                                 1 means that the TOF sigmas are LOR dependent
+     *                                 any other value means that the first value in the sigma_tof
+     *                                 array is used for all LORs
+     *  @param lor_dependent_tofcenter_offset unsigned char 0 or 1
+     *                                        1 means that the TOF center offsets are LOR dependent
+     *                                        any other value means that the first value in the
+     *                                        tofcenter_offset array is used for all LORs
+     *  @param threadsperblock number of threads per block
+     */
+    void joseph3d_back_tof_sino_cuda(const float *h_xstart,
+                                     const float *h_xend,
+                                     float **d_img,
+                                     const float *h_img_origin,
+                                     const float *h_voxsize,
+                                     const float *h_p,
+                                     long long nlors,
+                                     const int *h_img_dim,
+                                     float tofbin_width,
+                                     const float *h_sigma_tof,
+                                     const float *h_tofcenter_offset,
+                                     float n_sigmas,
+                                     short n_tofbins,
+                                     unsigned char lor_dependent_sigma_tof,
+                                     unsigned char lor_dependent_tofcenter_offset,
+                                     int threadsperblock);
+
+    /** @brief 3D non-tof joseph forward projector CUDA wrapper
+     *
+     *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to
+     *                  be projected.
+     *                  The pixel [i,j,k] ist stored at [n1*n2*i + n2*j + k].
+     *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
+     *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
+     *  @param h_p           array of length nlors (output) used to store the projections
+     *  @param nlors           number of projections (length of p array)
+     *  @param h_img_dim       array with dimensions of image [n0,n1,n2]
+     *  @param threadsperblock number of threads per block
+     */
+    void joseph3d_fwd_cuda(const float *h_xstart,
+                           const float *h_xend,
+                           float **d_img,
+                           const float *h_img_origin,
+                           const float *h_voxsize,
+                           float *h_p,
+                           long long nlors,
+                           const int *h_img_dim,
+                           int threadsperblock);
+
+    /** @brief 3D listmode tof joseph forward projector CUDA wrapper
+     *
+     *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to
+     *                  used to store the back projections.
+     *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
+     *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
+     *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
+     *  @param h_p           array of length nlors (output) used to store the projections
+     *  @param nlors         number of projections (length of p array)
+     *  @param h_img_dim     array with dimensions of image [n0,n1,n2]
+     *  @param tofbin_width  width of the TOF bins in spatial units (units of xstart and xend)
+     *  @param h_sigma_tof   array of length 1 or nlors (depending on lor_dependent_sigma_tof)
+     *                       with the TOF resolution (sigma) for each LOR in
+     *                       spatial units (units of xstart and xend)
+     *  @param h_tofcenter_offset array of length 1 or nlors (depending on lor_dependent_tofcenter_offset)
+     *                            with the offset of the central TOF bin from the
+     *                            midpoint of each LOR in spatial units (units of xstart and xend).
+     *                            A positive value means a shift towards the end point of the LOR.
+     *  @param n_sigmas           number of sigmas to consider for calculation of TOF kernel
+     *  @param h_tof_bin          array of length nlors with the tofbin of every event
+     *  @param lor_dependent_sigma_tof unsigned char 0 or 1
+     *                                 1 means that the TOF sigmas are LOR dependent
+     *                                 any other value means that the first value in the sigma_tof
+     *                                 array is used for all LORs
+     *  @param lor_dependent_tofcenter_offset unsigned char 0 or 1
+     *                                        1 means that the TOF center offsets are LOR dependent
+     *                                        any other value means that the first value in the
+     *                                        tofcenter_offset array is used for all LORs
+     *  @param threadsperblock    number of threads per block
+     */
+    void joseph3d_fwd_tof_lm_cuda(const float *h_xstart,
+                                  const float *h_xend,
+                                  float **d_img,
+                                  const float *h_img_origin,
+                                  const float *h_voxsize,
+                                  float *h_p,
+                                  long long nlors,
+                                  const int *h_img_dim,
+                                  float tofbin_width,
+                                  const float *h_sigma_tof,
+                                  const float *h_tofcenter_offset,
+                                  float n_sigmas,
+                                  const short *h_tof_bin,
+                                  unsigned char lor_dependent_sigma_tof,
+                                  unsigned char lor_dependent_tofcenter_offset,
+                                  int threadsperblock);
+
+    /** @brief 3D sinogram tof joseph forward projector CUDA wrapper
+     *
+     *  @param h_xstart array of shape [3*nlors] with the coordinates of the start points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param h_xend   array of shape [3*nlors] with the coordinates of the end   points of the LORs.
+     *                  The start coordinates of the n-th LOR are at xstart[n*3 + i] with i = 0,1,2.
+     *                  Units are the ones of voxsize.
+     *  @param d_img    Pointer to device arrays of shape [n0*n1*n2] containing the 3D image to
+     *                  used to store the back projections.
+     *                  The pixel [i,j,k] is stored at [n1*n2*i + n2*j + k].
+     *  @param h_img_origin  array [x0_0,x0_1,x0_2] of coordinates of the center of the [0,0,0] voxel
+     *  @param h_voxsize     array [vs0, vs1, vs2] of the voxel sizes
+     *  @param h_p           array of length nlors*n_tofbins (output) used to store the projections
+     *                       the order of the array is
+     *                       [LOR0-TOFBIN-0, LOR0-TOFBIN-1, ... LOR0_TOFBIN-(n-1),
+     *                        LOR1-TOFBIN-0, LOR1-TOFBIN-1, ... LOR1_TOFBIN-(n-1),
+     *                        ...
+     *                        LOR(N-1)-TOFBIN-0, LOR(N-1)-TOFBIN-1, ... LOR(N-1)_TOFBIN-(n-1)]
+     *  @param nlors         number of geometrical LORs
+     *  @param h_img_dim     array with dimensions of image [n0,n1,n2]
+     *  @param tofbin_width  width of the TOF bins in spatial units (units of xstart and xend)
+     *  @param h_sigma_tof   array of length 1 or nlors (depending on lor_dependent_sigma_tof)
+     *                       with the TOF resolution (sigma) for each LOR in
+     *                       spatial units (units of xstart and xend)
+     *  @param h_tofcenter_offset array of length 1 or nlors (depending on lor_dependent_tofcenter_offset)
+     *                            with the offset of the central TOF bin from the
+     *                            midpoint of each LOR in spatial units (units of xstart and xend).
+     *                            A positive value means a shift towards the end point of the LOR.
+     *  @param n_sigmas           number of sigmas to consider for calculation of TOF kernel
+     *  @param n_tofbins          number of TOF bins
+     *  @param lor_dependent_sigma_tof unsigned char 0 or 1
+     *                                 1 means that the TOF sigmas are LOR dependent
+     *                                 any other value means that the first value in the sigma_tof
+     *                                 array is used for all LORs
+     *  @param lor_dependent_tofcenter_offset unsigned char 0 or 1
+     *                                        1 means that the TOF center offsets are LOR dependent
+     *                                        any other value means that the first value in the
+     *                                        tofcenter_offset array is used for all LORs
+     *  @param threadsperblock    number of threads per block
+     */
+    void joseph3d_fwd_tof_sino_cuda(const float *h_xstart,
+                                    const float *h_xend,
+                                    float **d_img,
+                                    const float *h_img_origin,
+                                    const float *h_voxsize,
+                                    float *h_p,
+                                    long long nlors,
+                                    const int *h_img_dim,
+                                    float tofbin_width,
+                                    const float *h_sigma_tof,
+                                    const float *h_tofcenter_offset,
+                                    float n_sigmas,
+                                    short n_tofbins,
+                                    unsigned char lor_dependent_sigma_tof,
+                                    unsigned char lor_dependent_tofcenter_offset,
+                                    int threadsperblock);
+
+    /** @brief copy a float array to all visible cuda devices
+     *
+     *  The number of visible cuda devices is determined automatically via the CUDA API
+     *
+     *  @param    h_array   array of shape [n] on the host
+     *  @param    n         number of array elements
+     *  @return   a pointer to all devices arrays
+     */
+    float **copy_float_array_to_all_devices(const float *h_array, long long n);
+
+    /** @brief free device array on all visible cuda devices
+     *
+     *  The number of visible cuda devices is determined automatically via the CUDA API
+     *
+     *  @param d_array a pointer to all devices arrays
+     */
+    void free_float_array_on_all_devices(float **d_array);
+
+    /** @brief sum multiple versions of an array on different devices on first device
+     *
+     *  The number of visible cuda devices is determined automatically via the CUDA API
+     *  This becomes usefule when multiple devices backproject into separate images.
+     *
+     *  @param d_array a pointer to all devices arrays
+     *  @param    n         number of array elements
+     */
+    void sum_float_arrays_on_first_device(float **d_array, long long n);
+
+    /** @brief copy a (summed) float array from first device back to host
+     *
+     *  The number of visible cuda devices is determined automatically via the CUDA API
+     *
+     *  @param  d_array   a pointer to all devices arrays of shape [n]
+     *  @param  n         number of array elements
+     *  @param  i_dev     device number
+     *  @param  h_array   array of shape [n] on the host used for output
+     */
+    void get_float_array_from_device(float **d_array, long long n, int i_dev, float *h_array);
+
+    /** @brief get the number of visible cuda devices
+     */
+    int get_cuda_device_count();
 
 #ifdef __cplusplus
-}  /* extern "C" */
+} /* extern "C" */
 #endif
 
 #endif
diff --git a/cuda/src/utils_cuda.cu b/cuda/src/utils_cuda.cu
index fa7d88e7..420fc1a6 100644
--- a/cuda/src/utils_cuda.cu
+++ b/cuda/src/utils_cuda.cu
@@ -2,60 +2,60 @@
  * @file utils_cuda.cu
  */
 
-#include<stdio.h>
-#include<omp.h>
+#include <stdio.h>
+#include <omp.h>
 
-extern "C" __global__ void add_to_first_kernel(float* a, float* b, unsigned long long n)
+extern "C" __global__ void add_to_first_kernel(float *a, float *b, unsigned long long n)
 {
-// add a vector b onto a vector a both of length n
+  // add a vector b onto a vector a both of length n
 
   unsigned long long i = blockDim.x * blockIdx.x + threadIdx.x;
 
-  if(i < n)
+  if (i < n)
   {
     a[i] += b[i];
   }
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
-extern "C" __global__ void print_int_device_array(int* a)
+extern "C" __global__ void print_int_device_array(int *a)
 {
   unsigned long long i = blockDim.x * blockIdx.x + threadIdx.x;
   printf("%lld %d\n", i, a[i]);
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
-extern "C" __global__ void print_float_device_array(float* a)
+extern "C" __global__ void print_float_device_array(float *a)
 {
   unsigned long long i = blockDim.x * blockIdx.x + threadIdx.x;
   printf("%lld %f\n", i, a[i]);
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
-extern "C" float** copy_float_array_to_all_devices(const float *h_array, long long n)
+extern "C" float **copy_float_array_to_all_devices(const float *h_array, long long n)
 {
-  cudaError_t error;  
+  cudaError_t error;
 
   // get number of visible devices
   int num_devices;
-  cudaGetDeviceCount(&num_devices);  
+  cudaGetDeviceCount(&num_devices);
 
   // create pointer to device arrays
-  float **d_array = new float * [num_devices];
+  float **d_array = new float *[num_devices];
 
-  long long array_bytes = n*sizeof(float);
+  long long array_bytes = n * sizeof(float);
 
-  # pragma omp parallel for schedule(static)
-  for (int i_dev = 0; i_dev < num_devices; i_dev++) 
+#pragma omp parallel for schedule(static)
+  for (int i_dev = 0; i_dev < num_devices; i_dev++)
   {
     cudaSetDevice(i_dev);
 
     error = cudaMalloc(&d_array[i_dev], array_bytes);
     if (error != cudaSuccess)
     {
-        printf("cudaMalloc returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), 
-               error, __LINE__);
-        exit(EXIT_FAILURE);
+      printf("cudaMalloc returned error %s (code %d), line(%d)\n", cudaGetErrorString(error),
+             error, __LINE__);
+      exit(EXIT_FAILURE);
     }
     cudaMemcpyAsync(d_array[i_dev], h_array, array_bytes, cudaMemcpyHostToDevice);
   }
@@ -74,8 +74,8 @@ extern "C" void free_float_array_on_all_devices(float **d_array)
   int num_devices;
   cudaGetDeviceCount(&num_devices);
 
-  # pragma omp parallel for schedule(static)
-  for (int i_dev = 0; i_dev < num_devices; i_dev++) 
+#pragma omp parallel for schedule(static)
+  for (int i_dev = 0; i_dev < num_devices; i_dev++)
   {
     cudaFree(d_array[i_dev]);
   }
@@ -88,7 +88,7 @@ extern "C" void free_float_array_on_all_devices(float **d_array)
 extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
 {
 
-  cudaError_t error;  
+  cudaError_t error;
   int threadsperblock = 32;
   dim3 block(threadsperblock);
   int blockspergrid = (int)ceil((float)n / threadsperblock);
@@ -99,21 +99,21 @@ extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
 
   float *d_array2;
 
-  long long array_bytes = n*sizeof(float);
+  long long array_bytes = n * sizeof(float);
 
-  if(num_devices > 1)
+  if (num_devices > 1)
   {
     cudaSetDevice(0);
 
-    for (int i_dev = 0; i_dev < num_devices; i_dev++) 
+    for (int i_dev = 0; i_dev < num_devices; i_dev++)
     {
-      if(i_dev == 0)
+      if (i_dev == 0)
       {
         // allocate memory for aux array to sum arrays on device 0
         error = cudaMalloc(&d_array2, array_bytes);
         if (error != cudaSuccess)
         {
-          printf("cudaMalloc returned error %s (code %d), line(%d)\n", 
+          printf("cudaMalloc returned error %s (code %d), line(%d)\n",
                  cudaGetErrorString(error), error, __LINE__);
           exit(EXIT_FAILURE);
         }
@@ -125,12 +125,12 @@ extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
         cudaMemcpyPeer(d_array2, 0, d_array[i_dev], i_dev, array_bytes);
 
         // call summation kernel to add d_array2 to d_array on device 0
-        add_to_first_kernel<<<grid,block>>>(d_array[0], d_array2, n);
+        add_to_first_kernel<<<grid, block>>>(d_array[0], d_array2, n);
       }
 
       cudaDeviceSynchronize();
     }
-    
+
     cudaFree(d_array2);
   }
 }
@@ -139,5 +139,17 @@ extern "C" void sum_float_arrays_on_first_device(float **d_array, long long n)
 extern "C" void get_float_array_from_device(float **d_array, long long n, int i_dev, float *h_array)
 {
   cudaSetDevice(i_dev);
-  cudaMemcpy(h_array, d_array[i_dev], n*sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_array, d_array[i_dev], n * sizeof(float), cudaMemcpyDeviceToHost);
 }
+
+////////////////////////////////////////////////////////////////////////////////////////////
+extern "C" int get_cuda_device_count()
+{
+  int num_devices = 0;
+  cudaError_t err = cudaGetDeviceCount(&num_devices);
+
+  if (err != cudaSuccess)
+    num_devices = 0;
+
+  return num_devices;
+}
\ No newline at end of file
diff --git a/python/parallelproj/__init__.py b/python/parallelproj/__init__.py
index df645737..147403cf 100644
--- a/python/parallelproj/__init__.py
+++ b/python/parallelproj/__init__.py
@@ -1,4 +1,5 @@
-from .config import cuda_enabled, cupy_enabled, get_array_module, XPArray, XPFloat32Array, XPShortArray
+from .backend import cuda_present, cupy_enabled, get_array_module, XPArray, XPFloat32Array, XPShortArray
+from .backend import num_visible_cuda_devices
 from .backend import joseph3d_fwd, joseph3d_back
 from .backend import joseph3d_fwd_tof_sino, joseph3d_back_tof_sino
 from .backend import joseph3d_fwd_tof_lm, joseph3d_back_tof_lm
\ No newline at end of file
diff --git a/python/parallelproj/backend.py b/python/parallelproj/backend.py
index cd6b4434..289f29d9 100644
--- a/python/parallelproj/backend.py
+++ b/python/parallelproj/backend.py
@@ -1,4 +1,6 @@
 import os
+import importlib
+import distutils
 import math
 
 import ctypes
@@ -10,8 +12,28 @@
 
 import numpy as np
 import numpy.ctypeslib as npct
+import numpy.typing as npt
 
-from parallelproj.config import cuda_enabled, cupy_enabled, XPShortArray, XPFloat32Array, get_array_module
+from typing import Union
+from types import ModuleType
+
+# check if cuda is present
+cuda_present = distutils.spawn.find_executable('nvidia-smi') is not None
+
+# check if cupy is available
+cupy_enabled = (importlib.util.find_spec('cupy') is not None)
+
+# define type for cupy or numpy array
+if cupy_enabled:
+    import cupy as cp
+    import cupy.typing as cpt
+    XPArray = Union[npt.NDArray, cpt.NDArray]
+    XPFloat32Array = Union[npt.NDArray[np.float32], cpt.NDArray[np.float32]]
+    XPShortArray = Union[npt.NDArray[np.int16], cpt.NDArray[np.int16]]
+else:
+    XPArray = npt.NDArray
+    XPFloat32Array = npt.NDArray[np.float32]
+    XPShortArray = npt.NDArray[np.int16]
 
 # numpy ctypes lib array definitions
 ar_1d_single = npct.ndpointer(dtype=ctypes.c_float, ndim=1, flags='C')
@@ -137,7 +159,7 @@
 
 #---------------------------------------------------------------------------------------
 
-if cuda_enabled:
+if cuda_present:
     if 'PARALLELPROJ_CUDA_LIB' in os.environ:
         lib_parallelproj_cuda_fname = os.environ['PARALLELPROJ_CUDA_LIB']
     else:
@@ -152,6 +174,14 @@
             os.path.basename(lib_parallelproj_cuda_fname),
             os.path.dirname(lib_parallelproj_cuda_fname))
 
+        # get the number of visible cuda devices
+        lib_parallelproj_cuda.get_cuda_device_count.restype = np.int32
+        num_visible_cuda_devices = lib_parallelproj_cuda.get_cuda_device_count(
+        )
+
+        if (num_visible_cuda_devices == 0) and cupy_enabled:
+            cupy_enabled = False
+
         lib_parallelproj_cuda.joseph3d_fwd_cuda.restype = None
         lib_parallelproj_cuda.joseph3d_fwd_cuda.argtypes = [
             ar_1d_single,  # h_xstart
@@ -387,7 +417,7 @@ def joseph3d_fwd(xstart: XPFloat32Array,
              xp.asarray(img_dim)))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -461,7 +491,7 @@ def joseph3d_back(xstart: XPFloat32Array,
              np.int64(nLORs), xp.asarray(img_dim)))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # back projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -569,7 +599,7 @@ def joseph3d_fwd_tof_sino(xstart: XPFloat32Array,
              lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # back projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -689,7 +719,7 @@ def joseph3d_back_tof_sino(xstart: XPFloat32Array,
              lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # back projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -815,7 +845,7 @@ def joseph3d_fwd_tof_lm(xstart: XPFloat32Array,
              lor_dependent_tofcenter_offset))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -933,7 +963,7 @@ def joseph3d_back_tof_lm(xstart: XPFloat32Array,
              lor_dependent_tofcenter_offset))
         xp.cuda.Device().synchronize()
     else:
-        if cuda_enabled:
+        if num_visible_cuda_devices > 0:
             # back projection of numpy array using the cuda parallelproj lib
             num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
 
@@ -982,7 +1012,28 @@ def joseph3d_back_tof_lm(xstart: XPFloat32Array,
         else:
             # back projection of numpy array using the openmp parallelproj lib
             lib_parallelproj_c.joseph3d_back_tof_lm(
-                xstart.ravel(), xend.ravel(), back_img.ravel(), img_origin,
-                voxsize, lst, np.int64(nLORs), img_dim, tofbin_width, sigma_tof,
+                xstart.ravel(),
+                xend.ravel(), back_img.ravel(), img_origin, voxsize, lst,
+                np.int64(nLORs), img_dim, tofbin_width, sigma_tof,
                 tofcenter_offset, nsigmas, tofbin, lor_dependent_sigma_tof,
-                lor_dependent_tofcenter_offset)
\ No newline at end of file
+                lor_dependent_tofcenter_offset)
+
+
+#-----------------------------------------------------------------------------
+
+
+def get_array_module(array) -> ModuleType:
+    """return module of a cupy or numpy array
+
+    Parameters
+    ----------
+    array : cupy or numpy array
+
+    Returns
+    -------
+    cupy or numpy module
+    """
+    if cupy_enabled:
+        return cp.get_array_module(array)
+    else:
+        return np
\ No newline at end of file
diff --git a/python/parallelproj/config.py b/python/parallelproj/config.py
deleted file mode 100644
index 285fc403..00000000
--- a/python/parallelproj/config.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""package configurations"""
-import importlib
-import GPUtil
-import numpy as np
-import numpy.typing as npt
-
-from typing import Union
-from types import ModuleType
-
-# number of available CUDA devices
-num_available_cuda_devices = len(GPUtil.getGPUs())
-cuda_enabled = (num_available_cuda_devices > 0)
-# check if cupy is available
-cupy_enabled = (importlib.util.find_spec('cupy') is not None)
-
-# define type for cupy or numpy array
-if cuda_enabled:
-    import cupy as cp
-    import cupy.typing as cpt
-    XPArray = Union[npt.NDArray, cpt.NDArray]
-    XPFloat32Array = Union[npt.NDArray[np.float32], cpt.NDArray[np.float32]]
-    XPShortArray = Union[npt.NDArray[np.int16], cpt.NDArray[np.int16]]
-else:
-    XPArray = npt.NDArray
-    XPFloat32Array = npt.NDArray[np.float32]
-    XPShortArray = npt.NDArray[np.int16]
-
-
-def get_array_module(array) -> ModuleType:
-    """return module of a cupy or numpy array
-
-    Parameters
-    ----------
-    array : cupy or numpy array
-
-    Returns
-    -------
-    cupy or numpy module
-    """
-    if cupy_enabled:
-        return cp.get_array_module(array)
-    else:
-        return np
\ No newline at end of file
diff --git a/python/test/test_nontof_joseph.py b/python/test/test_nontof_joseph.py
index fac79886..de09d710 100644
--- a/python/test/test_nontof_joseph.py
+++ b/python/test/test_nontof_joseph.py
@@ -64,7 +64,7 @@ def fwd_test(xp: ModuleType, verbose=True) -> bool:
 
     if verbose:
         print(
-            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.cuda_enabled}'
+            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.num_visible_cuda_devices > 0}'
         )
         print('calculated projection = ', img_fwd)
         print('expected   projection = ', expected_projections)
@@ -134,7 +134,7 @@ def adjointness_test(xp: ModuleType,
 
     if verbose:
         print(
-            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.cuda_enabled}'
+            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.num_visible_cuda_devices > 0}'
         )
         print('ip_a = ', ip_a)
         print('ip_b = ', ip_b)
@@ -157,11 +157,6 @@ def test_fwd(self):
             import cupy as cp
             self.assertTrue(fwd_test(cp))
 
-        if parallelproj.cuda_enabled:
-            parallelproj.cuda_enabled = False
-            self.assertTrue(fwd_test(np))
-            parallelproj.cuda_enabled = True
-
     def test_adjoint(self):
         """test non TOF joseph forward projection using different backends"""
         self.assertTrue(adjointness_test(np))
@@ -170,11 +165,6 @@ def test_adjoint(self):
             import cupy as cp
             self.assertTrue(adjointness_test(cp))
 
-        if parallelproj.cuda_enabled:
-            parallelproj.cuda_enabled = False
-            self.assertTrue(adjointness_test(np))
-            parallelproj.cuda_enabled = True
-
 
 #--------------------------------------------------------------------------
 
diff --git a/python/test/test_toflm_joseph.py b/python/test/test_toflm_joseph.py
index be825833..8536785e 100644
--- a/python/test/test_toflm_joseph.py
+++ b/python/test/test_toflm_joseph.py
@@ -69,7 +69,7 @@ def tof_lm_fwd_test(xp: ModuleType, verbose: bool = True) -> None:
 
     if verbose:
         print(
-            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.cuda_enabled}'
+            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.num_visible_cuda_devices > 0}'
         )
         print(
             f'sum of TOF profile / expected:    {float(img_fwd.sum()):.4E} / {voxsize:.4E}'
@@ -147,7 +147,7 @@ def adjointness_test(xp: ModuleType,
 
     if verbose:
         print(
-            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.cuda_enabled}'
+            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.num_visible_cuda_devices > 0}'
         )
         print('ip_a = ', ip_a)
         print('ip_b = ', ip_b)
@@ -170,11 +170,6 @@ def test_adjoint(self):
             import cupy as cp
             self.assertTrue(adjointness_test(cp))
 
-        #if parallelproj.cuda_enabled:
-        #    parallelproj.cuda_enabled = False
-        #    self.assertTrue(adjointness_test(np))
-        #    parallelproj.cuda_enabled = True
-
     def test_forward(self):
         """test TOF joseph forward projection using different backends"""
         self.assertTrue(tof_lm_fwd_test(np))
@@ -183,11 +178,6 @@ def test_forward(self):
             import cupy as cp
             self.assertTrue(tof_lm_fwd_test(cp))
 
-        #if parallelproj.cuda_enabled:
-        #    parallelproj.cuda_enabled = False
-        #    self.assertTrue(tof_sino_fwd_test(np))
-        #    parallelproj.cuda_enabled = True
-
 
 #--------------------------------------------------------------------------
 
diff --git a/python/test/test_tofsino_joseph.py b/python/test/test_tofsino_joseph.py
index f6c9d47e..471d87bd 100644
--- a/python/test/test_tofsino_joseph.py
+++ b/python/test/test_tofsino_joseph.py
@@ -67,7 +67,7 @@ def tof_sino_fwd_test(xp: ModuleType, verbose: bool = True) -> None:
 
     if verbose:
         print(
-            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.cuda_enabled}'
+            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.num_visible_cuda_devices > 0}'
         )
         print(
             f'sum of TOF profile / expected:    {float(img_fwd.sum()):.4E} / {voxsize:.4E}'
@@ -145,7 +145,7 @@ def adjointness_test(xp: ModuleType,
 
     if verbose:
         print(
-            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.cuda_enabled}'
+            f'module = {xp.__name__}  -  cuda_enabled {parallelproj.num_visible_cuda_devices > 0}'
         )
         print('ip_a = ', ip_a)
         print('ip_b = ', ip_b)
@@ -168,11 +168,6 @@ def test_adjoint(self):
             import cupy as cp
             self.assertTrue(adjointness_test(cp))
 
-        #if parallelproj.cuda_enabled:
-        #    parallelproj.cuda_enabled = False
-        #    self.assertTrue(adjointness_test(np))
-        #    parallelproj.cuda_enabled = True
-
     def test_forward(self):
         """test TOF joseph forward projection using different backends"""
         self.assertTrue(tof_sino_fwd_test(np))
@@ -181,11 +176,6 @@ def test_forward(self):
             import cupy as cp
             self.assertTrue(tof_sino_fwd_test(cp))
 
-        #if parallelproj.cuda_enabled:
-        #    parallelproj.cuda_enabled = False
-        #    self.assertTrue(tof_sino_fwd_test(np))
-        #    parallelproj.cuda_enabled = True
-
 
 #--------------------------------------------------------------------------