Commit: safeguarding all custom CUDA and C++ routines via the _cuda_extension_available flag (#54)

* safeguarding all custom CUDA and C++ routines via the _cuda_extension_available flag

* bumping up version number
bonevbs authored Oct 1, 2024
1 parent 4fea88b commit 663bea1
Showing 6 changed files with 32 additions and 25 deletions.
Changelog.md (1 change: 1 addition & 0 deletions)

@@ -7,6 +7,7 @@
 * Added resampling modules for convenience
 * Changing behavior of distributed SHT to use `dim=-3` as channel dimension
 * Fixing SHT unittests to test SHT and ISHT individually, rather than the roundtrip
+* Changing the way custom CUDA extensions are handled
 
 ### v0.7.1

README.md (6 changes: 3 additions & 3 deletions)

@@ -75,16 +75,16 @@ torch-harmonics has been used to implement a variety of differentiable PDE solve


 ## Installation
-Download directly from PyPI:
+A simple installation can be directly done from PyPI:
 
 ```bash
 pip install torch-harmonics
 ```
-If you would like to enforce the compilation of CUDA extensions for the discrete-continuous convolutions, you can do so by setting the `FORCE_CUDA_EXTENSION` flag. You may also want to set appropriate architectures with the `TORCH_CUDA_ARCH_LIST` flag.
+If you are planning to use spherical convolutions, we recommend building the corresponding custom CUDA kernels. To enforce this, you can set the `FORCE_CUDA_EXTENSION` flag. You may also want to set appropriate architectures with the `TORCH_CUDA_ARCH_LIST` flag. Finally, make sure to disable build isolation via the `--no-build-isolation` flag to ensure that the custom kernels are built with the existing torch installation.
 ```bash
 export FORCE_CUDA_EXTENSION=1
 export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX"
-pip install torch-harmonics
+pip install --no-build-isolation torch-harmonics
 ```
 :warning: Please note that the custom CUDA extensions currently only support CUDA architectures >= 7.0.

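Together with the pyproject.toml change below (torch is removed from the build requirements), the `--no-build-isolation` flag lets pip compile the kernels against the torch installation that is already present. A quick way to verify that the extension actually got built is sketched here; the module name `disco_cuda_extension` is taken from the imports touched in this commit, and the check itself is only a sketch, not an official torch-harmonics API.

```python
# Minimal post-install check (a sketch; module name taken from this commit's diff).
try:
    import disco_cuda_extension  # compiled CUDA kernels for the DISCO convolutions
    print("custom CUDA extension available")
except ImportError:
    print("custom CUDA extension not built; the pure PyTorch code path will be used")
```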
pyproject.toml (2 changes: 1 addition & 1 deletion)

@@ -1,5 +1,5 @@
 [build-system]
-requires = [ "setuptools", "setuptools-scm", "torch>=2.4.0"]
+requires = [ "setuptools", "setuptools-scm"]
 build-backend = "setuptools.build_meta"
 
 [project]
torch_harmonics/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -29,7 +29,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 
-__version__ = "0.7.1"
+__version__ = "0.7.2"
 
 from .sht import RealSHT, InverseRealSHT, RealVectorSHT, InverseRealVectorSHT
 from .convolution import DiscreteContinuousConvS2, DiscreteContinuousConvTransposeS2
torch_harmonics/convolution.py (24 changes: 14 additions & 10 deletions)

@@ -44,12 +44,10 @@
 from torch_harmonics._disco_convolution import _disco_s2_contraction_torch, _disco_s2_transpose_contraction_torch
 from torch_harmonics._disco_convolution import _disco_s2_contraction_cuda, _disco_s2_transpose_contraction_cuda
 
-# import custom C++/CUDA extensions
-from disco_helpers import preprocess_psi
-
+# import custom C++/CUDA extensions if available
 try:
+    from disco_helpers import preprocess_psi
     import disco_cuda_extension
-
     _cuda_extension_available = True
 except ImportError as err:
     disco_cuda_extension = None
@@ -377,10 +375,13 @@ def __init__(
         row_idx = idx[1, ...].contiguous()
         col_idx = idx[2, ...].contiguous()
         vals = vals.contiguous()
-        roff_idx = preprocess_psi(self.kernel_size, out_shape[0], ker_idx, row_idx, col_idx, vals).contiguous()
 
-        # preprocessed data-structure for GPU kernel
-        self.register_buffer("psi_roff_idx", roff_idx, persistent=False)
+        if _cuda_extension_available:
+            # preprocessed data-structure for GPU kernel
+            roff_idx = preprocess_psi(self.kernel_size, out_shape[0], ker_idx, row_idx, col_idx, vals).contiguous()
+            self.register_buffer("psi_roff_idx", roff_idx, persistent=False)
+
+        # save all datastructures
         self.register_buffer("psi_ker_idx", ker_idx, persistent=False)
         self.register_buffer("psi_row_idx", row_idx, persistent=False)
         self.register_buffer("psi_col_idx", col_idx, persistent=False)
@@ -468,10 +469,13 @@ def __init__(
         row_idx = idx[1, ...].contiguous()
         col_idx = idx[2, ...].contiguous()
         vals = vals.contiguous()
-        roff_idx = preprocess_psi(self.kernel_size, in_shape[0], ker_idx, row_idx, col_idx, vals).contiguous()
 
-        # preprocessed data-structure for GPU kernel
-        self.register_buffer("psi_roff_idx", roff_idx, persistent=False)
+        if _cuda_extension_available:
+            # preprocessed data-structure for GPU kernel
+            roff_idx = preprocess_psi(self.kernel_size, in_shape[0], ker_idx, row_idx, col_idx, vals).contiguous()
+            self.register_buffer("psi_roff_idx", roff_idx, persistent=False)
+
+        # save all datastructures
         self.register_buffer("psi_ker_idx", ker_idx, persistent=False)
         self.register_buffer("psi_row_idx", row_idx, persistent=False)
         self.register_buffer("psi_col_idx", col_idx, persistent=False)
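The hunks above implement the pattern named in the commit title: the custom C++/CUDA helpers are imported inside a try/except, and every code path that depends on them is gated on the `_cuda_extension_available` flag. Below is a minimal, self-contained sketch of that guard-and-dispatch pattern; `my_cuda_ext` and `fused_op` are hypothetical names used purely for illustration and are not part of torch-harmonics.

```python
import torch

# Guarded import: if the compiled extension was never built, degrade gracefully
# instead of failing at import time.
try:
    import my_cuda_ext  # hypothetical compiled extension module
    _cuda_extension_available = True
except ImportError:
    my_cuda_ext = None
    _cuda_extension_available = False


def fused_op(x: torch.Tensor) -> torch.Tensor:
    """Use the custom kernel only when it was built and the input lives on the GPU."""
    if _cuda_extension_available and x.is_cuda:
        return my_cuda_ext.fused_op(x)  # hypothetical entry point into the extension
    # pure PyTorch fallback keeps CPU-only and extension-less installs working
    return torch.relu(x)  # stand-in for the equivalent torch implementation
```

In this commit the same idea is applied at module construction time: the `preprocess_psi` data structure needed by the GPU kernel is only built and registered as a buffer when `_cuda_extension_available` is true.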
torch_harmonics/distributed/distributed_convolution.py (22 changes: 12 additions & 10 deletions)

@@ -58,12 +58,10 @@
 from torch_harmonics.distributed import polar_group_rank, azimuth_group_rank
 from torch_harmonics.distributed import compute_split_shapes, split_tensor_along_dim
 
-# import custom C++/CUDA extensions
-from disco_helpers import preprocess_psi
-
+# import custom C++/CUDA extensions if available
 try:
+    from disco_helpers import preprocess_psi
     import disco_cuda_extension
-
     _cuda_extension_available = True
 except ImportError as err:
     disco_cuda_extension = None
@@ -240,10 +238,12 @@ def __init__(
         row_idx = idx[1, ...].contiguous()
         col_idx = idx[2, ...].contiguous()
         vals = vals.contiguous()
-        roff_idx = preprocess_psi(self.kernel_size, self.nlat_out_local, ker_idx, row_idx, col_idx, vals).contiguous()
 
-        # preprocessed data-structure for GPU kernel
-        self.register_buffer("psi_roff_idx", roff_idx, persistent=False)
+        if _cuda_extension_available:
+            # preprocessed data-structure for GPU kernel
+            roff_idx = preprocess_psi(self.kernel_size, self.nlat_out_local, ker_idx, row_idx, col_idx, vals).contiguous()
+            self.register_buffer("psi_roff_idx", roff_idx, persistent=False)
+
         self.register_buffer("psi_ker_idx", ker_idx, persistent=False)
         self.register_buffer("psi_row_idx", row_idx, persistent=False)
         self.register_buffer("psi_col_idx", col_idx, persistent=False)
@@ -370,10 +370,12 @@ def __init__(
         row_idx = idx[1, ...].contiguous()
         col_idx = idx[2, ...].contiguous()
         vals = vals.contiguous()
-        roff_idx = preprocess_psi(self.kernel_size, self.nlat_in_local, ker_idx, row_idx, col_idx, vals).contiguous()
 
-        # preprocessed data-structure for GPU kernel
-        self.register_buffer("psi_roff_idx", roff_idx, persistent=False)
+        if _cuda_extension_available:
+            # preprocessed data-structure for GPU kernel
+            roff_idx = preprocess_psi(self.kernel_size, self.nlat_in_local, ker_idx, row_idx, col_idx, vals).contiguous()
+            self.register_buffer("psi_roff_idx", roff_idx, persistent=False)
+
         self.register_buffer("psi_ker_idx", ker_idx, persistent=False)
         self.register_buffer("psi_row_idx", row_idx, persistent=False)
         self.register_buffer("psi_col_idx", col_idx, persistent=False)