From 3dacda2ecb72c12f97c9a579343ef6c7ecfdca35 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Wed, 6 Dec 2023 13:01:07 +0100
Subject: [PATCH 01/20] poc/ntt-cuda/cuda/ntt_api.cu: resolve clang warning.

---
 poc/ntt-cuda/cuda/ntt_api.cu | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/poc/ntt-cuda/cuda/ntt_api.cu b/poc/ntt-cuda/cuda/ntt_api.cu
index 3ca32fe..38f4d78 100644
--- a/poc/ntt-cuda/cuda/ntt_api.cu
+++ b/poc/ntt-cuda/cuda/ntt_api.cu
@@ -27,10 +27,11 @@
 #ifndef __CUDA_ARCH__
 
 extern "C"
-RustError compute_ntt(size_t device_id, fr_t* inout, uint32_t lg_domain_size,
-                      NTT::InputOutputOrder ntt_order,
-                      NTT::Direction ntt_direction,
-                      NTT::Type ntt_type)
+RustError::by_value compute_ntt(size_t device_id,
+                                fr_t* inout, uint32_t lg_domain_size,
+                                NTT::InputOutputOrder ntt_order,
+                                NTT::Direction ntt_direction,
+                                NTT::Type ntt_type)
 {
     auto& gpu = select_gpu(device_id);
 

From 71a96567675847b1e5e372cf53ae3731abe4a733 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Wed, 6 Dec 2023 13:01:47 +0100
Subject: [PATCH 02/20] ff/baby_bear.hpp: resolve clang compilation failure.

clang apparently casts the operator-matching net too wide and confuses
bb31_t x^pow, where pow is unsigned, with operator^(int,int). Just in
case, recall that for the time being baby_bear.hpp facilitates syntax
vetting by the host compiler, the CUDA code is generated by cicc.
---
 ff/baby_bear.hpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/ff/baby_bear.hpp b/ff/baby_bear.hpp
index cce438a..c7439f0 100644
--- a/ff/baby_bear.hpp
+++ b/ff/baby_bear.hpp
@@ -27,18 +27,18 @@ class bb31_t {
     // this is used in constant declaration, e.g. as bb31_t{11}
     inline constexpr bb31_t(int a) : val(((uint64_t)a << 32) % MOD) {}
 
-    static inline const bb31_t one()            { return bb31_t(1); }
-    inline bb31_t& operator+=(bb31_t b)         { return *this;     }
-    inline bb31_t& operator-=(bb31_t b)         { return *this;     }
-    inline bb31_t& operator*=(bb31_t b)         { return *this;     }
-    inline bb31_t& operator^=(int b)            { return *this;     }
-    inline bb31_t& sqr()                        { return *this;     }
-    friend bb31_t operator+(bb31_t a, bb31_t b) { return a += b;    }
-    friend bb31_t operator-(bb31_t a, bb31_t b) { return a -= b;    }
-    friend bb31_t operator*(bb31_t a, bb31_t b) { return a *= b;    }
-    friend bb31_t operator^(bb31_t a, int b)    { return a ^= b;    }
-    inline void zero()                          { val = 0;          }
-    inline bool is_zero() const                 { return val==0;    }
+    static inline const bb31_t one()                { return bb31_t(1); }
+    inline bb31_t& operator+=(bb31_t b)             { return *this;     }
+    inline bb31_t& operator-=(bb31_t b)             { return *this;     }
+    inline bb31_t& operator*=(bb31_t b)             { return *this;     }
+    inline bb31_t& operator^=(int b)                { return *this;     }
+    inline bb31_t& sqr()                            { return *this;     }
+    friend bb31_t operator+(bb31_t a, bb31_t b)     { return a += b;    }
+    friend bb31_t operator-(bb31_t a, bb31_t b)     { return a -= b;    }
+    friend bb31_t operator*(bb31_t a, bb31_t b)     { return a *= b;    }
+    friend bb31_t operator^(bb31_t a, uint32_t b)   { return a ^= b;    }
+    inline void zero()                              { val = 0;          }
+    inline bool is_zero() const                     { return val==0;    }
     inline operator uint32_t() const
     {   return ((val*M)*(uint64_t)MOD + val) >> 32;  }
     inline void to()    { val = ((uint64_t)val<<32) % MOD;  }

From 62ba663b73b2f6b7a0ff5f02683bcbfa9c30d0d5 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Wed, 6 Dec 2023 13:15:49 +0100
Subject: [PATCH 03/20] Add .github/workflows/ci.yml to exercise PoCs' build
 procedures.

---
 .github/workflows/ci.yml | 83 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..17a213a
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,83 @@
+name: build
+
+on:
+  push:
+    branches:
+    - '**'
+  workflow_dispatch:
+    branches:
+    - '**'
+  pull_request:
+    branches:
+    - main
+
+jobs:
+  cargo-test-no-run:
+    runs-on: ubuntu-22.04
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Get date
+      id: get-date
+      run: echo "date=$(date -u +%Y-%m)" >> $GITHUB_OUTPUT
+      shell: bash
+
+    - uses: actions/cache@v3
+      with:
+        path: |
+          ~/.cargo/registry
+          **/Cargo.lock
+          **/target
+          /usr/local/cuda-12.3
+        key: ${{ runner.os }}-cargo-${{ steps.get-date.outputs.date }}
+
+    - name: Environment
+      shell: bash
+      run: |
+        lscpu 2>/dev/null && echo --- || true
+        env | sort
+
+    - name: Install cuda-minimal-build-12-3
+      shell: bash
+      run: |
+        if [ ! -d /usr/local/cuda-12.3 ]; then
+            # https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network
+            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+            sudo dpkg -i cuda-keyring_1.1-1_all.deb
+            sudo apt-get update
+            sudo apt-get -y install cuda-minimal-build-12-3
+        fi
+        [ -d /usr/local/cuda-12.3/bin ]
+
+    - name: The test
+      shell: bash
+      run: |
+        rustc --version --verbose
+        export PATH=$PATH:/usr/local/cuda-12.3/bin
+        ( cd poc/ntt-cuda
+          cargo test --release --no-run --features=bls12_381
+          cargo test --release --no-run --features=gl64
+          cargo test --release --no-run --features=bb31
+          export CXX=clang++
+          cargo test --release --no-run --features=bls12_381
+          cargo test --release --no-run --features=gl64
+          cargo test --release --no-run --features=bb31
+          cargo clean -p ntt-cuda
+          cargo clean -p ntt-cuda --release
+        )
+        ( cd poc/msm-cuda
+          cargo test --release --no-run --features=bls12_381
+          cargo test --release --no-run --features=bn254
+          export CXX=clang++
+          cargo test --release --no-run --features=bls12_381
+          cargo test --release --no-run --features=bn254
+          cargo clean -p msm-cuda
+          cargo clean -p msm-cuda --release
+        )
+        rm -rf poc/*/target/.rustc_info.json
+        rm -rf poc/*/target/package
+        rm -rf poc/*/target/{debug,release}/incremental
+        rm -rf poc/*/target/*/{debug,release}/incremental
+        rm -rf ~/.cargo/registry/src
+        rm -rf ~/.cargo/registry/index/*/.cache

From 121c757ff31f0a4fa9a0920b4876a28431875c91 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Wed, 6 Dec 2023 13:28:25 +0100
Subject: [PATCH 04/20] poc/ntt-cuda/Cargo.toml: drop unused dev-dependencies.

---
 .github/workflows/ci.yml | 77 ++++++++++++++++++++++++++--------------
 poc/ntt-cuda/Cargo.toml  |  2 --
 2 files changed, 51 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 17a213a..cea979d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,4 +1,4 @@
-name: build
+name: test-build
 
 on:
   push:
@@ -29,7 +29,6 @@ jobs:
           ~/.cargo/registry
           **/Cargo.lock
           **/target
-          /usr/local/cuda-12.3
         key: ${{ runner.os }}-cargo-${{ steps.get-date.outputs.date }}
 
     - name: Environment
@@ -41,43 +40,69 @@ jobs:
     - name: Install cuda-minimal-build-12-3
       shell: bash
       run: |
-        if [ ! -d /usr/local/cuda-12.3 ]; then
-            # https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network
-            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-            sudo dpkg -i cuda-keyring_1.1-1_all.deb
-            sudo apt-get update
-            sudo apt-get -y install cuda-minimal-build-12-3
-        fi
+        # https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        sudo dpkg -i cuda-keyring_1.1-1_all.deb
+        sudo apt-get update
+        sudo apt-get -y install cuda-minimal-build-12-3
         [ -d /usr/local/cuda-12.3/bin ]
 
-    - name: The test
+    - name: Test-build poc/ntt-cuda
       shell: bash
       run: |
         rustc --version --verbose
         export PATH=$PATH:/usr/local/cuda-12.3/bin
         ( cd poc/ntt-cuda
-          cargo test --release --no-run --features=bls12_381
-          cargo test --release --no-run --features=gl64
-          cargo test --release --no-run --features=bb31
-          export CXX=clang++
-          cargo test --release --no-run --features=bls12_381
-          cargo test --release --no-run --features=gl64
-          cargo test --release --no-run --features=bb31
+          cargo update
+          cargo test --no-run --release --features=bls12_381
+          cargo test --no-run --release --features=gl64
+          cargo test --no-run --release --features=bb31
+          if which clang++ 2>/dev/null; then
+              echo
+              echo Testing with clang++
+              echo
+              clang++ --version
+              echo
+              export CXX=clang++
+              cargo test --no-run --release --features=bls12_381
+              cargo test --no-run --release --features=gl64
+              cargo test --no-run --release --features=bb31
+          fi
           cargo clean -p ntt-cuda
           cargo clean -p ntt-cuda --release
+          rm -rf target/.rustc_info.json
+          rm -rf target/package
+          rm -rf target/{debug,release}/incremental
+          rm -rf target/*/{debug,release}/incremental
         )
+
+    - name: Test-build poc/msm-cuda
+      shell: bash
+      run: |
+        rustc --version --verbose
+        export PATH=$PATH:/usr/local/cuda-12.3/bin
         ( cd poc/msm-cuda
-          cargo test --release --no-run --features=bls12_381
-          cargo test --release --no-run --features=bn254
-          export CXX=clang++
-          cargo test --release --no-run --features=bls12_381
-          cargo test --release --no-run --features=bn254
+          sed "s/^crit/#crit/" Cargo.toml > Cargo.$$.toml && \
+          mv Cargo.$$.toml Cargo.toml
+          cargo update
+          cargo test --no-run --release --features=bls12_381,quiet
+          cargo test --no-run --release --features=bn254,quiet
+          if which clang++ 2>/dev/null; then
+              echo
+              echo Testing with clang++
+              echo
+              clang++ --version
+              echo
+              export CXX=clang++
+              cargo test --no-run --release --features=bls12_381,quiet
+              cargo test --no-run --release --features=bn254,quiet
+          fi
           cargo clean -p msm-cuda
           cargo clean -p msm-cuda --release
+          rm -rf target/.rustc_info.json
+          rm -rf target/package
+          rm -rf target/{debug,release}/incremental
+          rm -rf target/*/{debug,release}/incremental
         )
-        rm -rf poc/*/target/.rustc_info.json
-        rm -rf poc/*/target/package
-        rm -rf poc/*/target/{debug,release}/incremental
-        rm -rf poc/*/target/*/{debug,release}/incremental
         rm -rf ~/.cargo/registry/src
         rm -rf ~/.cargo/registry/index/*/.cache
diff --git a/poc/ntt-cuda/Cargo.toml b/poc/ntt-cuda/Cargo.toml
index fdd1822..3498b5d 100644
--- a/poc/ntt-cuda/Cargo.toml
+++ b/poc/ntt-cuda/Cargo.toml
@@ -32,8 +32,6 @@ sppark = { path = "../../rust" }
 cc = "^1.0.70"
 
 [dev-dependencies]
-criterion = { version = "0.3", features = [ "html_reports" ] }
-rayon = "1.5"
 rand = "^0"
 ark-std = "0.3.0"
 ark-ff = "0.3.0"

From b8d0005a83f92c0a55a8fe0d8ad0d8f5968a8c30 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Tue, 28 Nov 2023 13:37:48 +0100
Subject: [PATCH 05/20] ntt/kernels/*_wide.cu: omit one multiplication.

---
 ntt/kernels/ct_mixed_radix_wide.cu | 3 +--
 ntt/kernels/gs_mixed_radix_wide.cu | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/ntt/kernels/ct_mixed_radix_wide.cu b/ntt/kernels/ct_mixed_radix_wide.cu
index 11366ad..efa1a58 100644
--- a/ntt/kernels/ct_mixed_radix_wide.cu
+++ b/ntt/kernels/ct_mixed_radix_wide.cu
@@ -43,12 +43,11 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         unsigned int nbits = MAX_LG_DOMAIN_SIZE - stage;
 
         index_t root_idx0 = bit_rev(thread_ntt_idx, nbits) * thread_ntt_pos;
-        index_t root_idx1 = thread_ntt_pos << (nbits - 1);
+        index_t root_idx1 = root_idx0 + (thread_ntt_pos << (nbits - 1));
 
         fr_t first_root, second_root;
         get_intermediate_roots(first_root, second_root,
                                root_idx0, root_idx1, d_partial_twiddles);
-        second_root *= first_root;
 
         r0 *= first_root;
         r1 *= second_root;
diff --git a/ntt/kernels/gs_mixed_radix_wide.cu b/ntt/kernels/gs_mixed_radix_wide.cu
index f5cd4bb..b6b94d4 100644
--- a/ntt/kernels/gs_mixed_radix_wide.cu
+++ b/ntt/kernels/gs_mixed_radix_wide.cu
@@ -89,12 +89,11 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         unsigned int nbits = MAX_LG_DOMAIN_SIZE - (stage - iterations);
 
         index_t root_idx0 = bit_rev(thread_ntt_idx, nbits) * thread_ntt_pos;
-        index_t root_idx1 = thread_ntt_pos << (nbits - 1);
+        index_t root_idx1 = root_idx0 + (thread_ntt_pos << (nbits - 1));
 
         fr_t first_root, second_root;
         get_intermediate_roots(first_root, second_root,
                                root_idx0, root_idx1, d_partial_twiddles);
-        second_root *= first_root;
 
         r0 *= first_root;
         r1 *= second_root;

From 8ae27a6ce4586323eb383031c317bd4ec88c4895 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Tue, 28 Nov 2023 13:38:32 +0100
Subject: [PATCH 06/20] ntt/kernels.cu: optimize some roots' calculations.

---
 ntt/kernels.cu | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/ntt/kernels.cu b/ntt/kernels.cu
index 208c961..c45306a 100644
--- a/ntt/kernels.cu
+++ b/ntt/kernels.cu
@@ -142,7 +142,31 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE],
 {
     unsigned int off = 0;
 
-    fr_t t, root = roots[off][pow % WINDOW_SIZE];
+    fr_t t, root;
+
+    if (sizeof(fr_t) <= 8) {
+        root = fr_t::one();
+        bool root_set = false;
+
+        #pragma unroll
+        for (unsigned int pow_win, i = 0; i < WINDOW_NUM; i++) {
+            if (!root_set && (pow_win = pow % WINDOW_SIZE)) {
+                root = roots[i][pow_win];
+                root_set = true;
+            }
+            if (!root_set) {
+                pow >>= LG_WINDOW_SIZE;
+                off++;
+            }
+        }
+    } else {
+        if ((pow % WINDOW_SIZE) == 0) {
+            pow >>= LG_WINDOW_SIZE;
+            off++;
+        }
+        root = roots[off][pow % WINDOW_SIZE];
+    }
+
     #pragma unroll 1
     while (pow >>= LG_WINDOW_SIZE)
         root *= (t = roots[++off][pow % WINDOW_SIZE]);
@@ -252,13 +276,16 @@ void get_intermediate_roots(fr_t& root0, fr_t& root1,
 {
     int win = (WINDOW_NUM - 1) * LG_WINDOW_SIZE;
     int off = (WINDOW_NUM - 1);
+    index_t idxo = idx0 | idx1;
+    index_t mask = ((index_t)1 << win) - 1;
 
     root0 = roots[off][idx0 >> win];
     root1 = roots[off][idx1 >> win];
     #pragma unroll 1
-    while (off--) {
+    while (off-- && (idxo & mask)) {
         fr_t t;
         win -= LG_WINDOW_SIZE;
+        mask >>= LG_WINDOW_SIZE;
         root0 *= (t = roots[off][(idx0 >> win) % WINDOW_SIZE]);
         root1 *= (t = roots[off][(idx1 >> win) % WINDOW_SIZE]);
     }

From 62045e72d1bd57f8f28f9ff4d6b8003e8f7e129d Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Mon, 27 Nov 2023 17:55:30 +0100
Subject: [PATCH 07/20] ntt/{kernels.cu,ntt.cuh}: omit redundant argument.

In the LDE_powers() case the operation is fully described by |lg_blowup|
parameter, and in the NTT_internal() case - by the |type| parameter.
---
 ntt/kernels.cu |  6 ++----
 ntt/ntt.cuh    | 43 +++++++++++++++++--------------------------
 2 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/ntt/kernels.cu b/ntt/kernels.cu
index c45306a..3d78821 100644
--- a/ntt/kernels.cu
+++ b/ntt/kernels.cu
@@ -176,8 +176,7 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE],
 
 __launch_bounds__(1024) __global__
 void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_blowup, bool bitrev,
-                           const fr_t (*gen_powers)[WINDOW_SIZE],
-                           bool ext_pow = false)
+                           const fr_t (*gen_powers)[WINDOW_SIZE])
 {
     index_t idx = threadIdx.x + blockDim.x * (index_t)blockIdx.x;
     index_t pow = idx;
@@ -191,8 +190,7 @@ void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_blowup, bool bitrev,
         pow = bit_rev(idx, lg_domain_size);
     }
 
-    if (ext_pow)
-        pow <<= lg_blowup;
+    pow <<= lg_blowup;
 
     r = r * get_intermediate_root(pow, gen_powers);
 
diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh
index 734a9a5..ca4fcd5 100644
--- a/ntt/ntt.cuh
+++ b/ntt/ntt.cuh
@@ -58,7 +58,7 @@ protected:
 private:
     static void LDE_powers(fr_t* inout, bool innt, bool bitrev,
                            uint32_t lg_domain_size, uint32_t lg_blowup,
-                           stream_t& stream, bool ext_pow = false)
+                           stream_t& stream)
     {
         size_t domain_size = (size_t)1 << lg_domain_size;
         const auto gen_powers =
@@ -66,23 +66,21 @@ private:
 
         if (domain_size < WARP_SZ)
             LDE_distribute_powers<<<1, domain_size, 0, stream>>>
-                                 (inout, lg_blowup, bitrev, gen_powers, ext_pow);
+                                 (inout, lg_blowup, bitrev, gen_powers);
         else if (domain_size < 512)
             LDE_distribute_powers<<<domain_size / WARP_SZ, WARP_SZ, 0, stream>>>
-                                 (inout, lg_blowup, bitrev, gen_powers, ext_pow);
+                                 (inout, lg_blowup, bitrev, gen_powers);
         else
             LDE_distribute_powers<<<domain_size / 512, 512, 0, stream>>>
-                                 (inout, lg_blowup, bitrev, gen_powers, ext_pow);
+                                 (inout, lg_blowup, bitrev, gen_powers);
 
         CUDA_OK(cudaGetLastError());
     }
 
 protected:
-    // coset_ext_pow is only used when NTT type is coset
     static void NTT_internal(fr_t* d_inout, uint32_t lg_domain_size,
                              InputOutputOrder order, Direction direction,
-                             Type type, stream_t& stream,
-                             bool coset_ext_pow = false)
+                             Type type, stream_t& stream)
     {
         // Pick an NTT algorithm based on the input order and the desired output
         // order of the data. In certain cases, bit reversal can be avoided which
@@ -116,8 +114,7 @@ protected:
         }
 
         if (!intt && type == Type::coset)
-            LDE_powers(d_inout, intt, bitrev, lg_domain_size, 0, stream,
-                       coset_ext_pow);
+            LDE_powers(d_inout, intt, bitrev, lg_domain_size, 0, stream);
 
         switch (algorithm) {
             case Algorithm::GS:
@@ -129,8 +126,7 @@ protected:
         }
 
         if (intt && type == Type::coset)
-            LDE_powers(d_inout, intt, !bitrev, lg_domain_size, 0, stream,
-                       coset_ext_pow);
+            LDE_powers(d_inout, intt, !bitrev, lg_domain_size, 0, stream);
 
         if (order == InputOutputOrder::RR)
             bit_rev(d_inout, d_inout, lg_domain_size, stream);
@@ -139,7 +135,7 @@ protected:
 public:
     static RustError Base(const gpu_t& gpu, fr_t* inout, uint32_t lg_domain_size,
                           InputOutputOrder order, Direction direction,
-                          Type type, bool coset_ext_pow = false)
+                          Type type)
     {
         if (lg_domain_size == 0)
             return RustError{cudaSuccess};
@@ -151,8 +147,7 @@ public:
             dev_ptr_t<fr_t> d_inout{domain_size, gpu};
             gpu.HtoD(&d_inout[0], inout, domain_size);
 
-            NTT_internal(&d_inout[0], lg_domain_size, order, direction, type, gpu,
-                         coset_ext_pow);
+            NTT_internal(&d_inout[0], lg_domain_size, order, direction, type, gpu);
 
             gpu.DtoH(inout, &d_inout[0], domain_size);
             gpu.sync();
@@ -169,8 +164,7 @@ public:
     }
 
     static RustError LDE(const gpu_t& gpu, fr_t* inout,
-                         uint32_t lg_domain_size, uint32_t lg_blowup,
-                         bool ext_pow = false)
+                         uint32_t lg_domain_size, uint32_t lg_blowup)
     {
         try {
             gpu.select();
@@ -190,7 +184,7 @@ public:
                 NTTParameters::all()[gpu.id()]->partial_group_gen_powers;
 
             LDE_launch(gpu, &d_ext_domain[0], &d_domain[0],
-                       gen_powers, lg_domain_size, lg_blowup, ext_pow);
+                       gen_powers, lg_domain_size, lg_blowup);
 
             NTT_internal(&d_ext_domain[0], lg_domain_size + lg_blowup,
                          InputOutputOrder::RN, Direction::forward,
@@ -251,8 +245,7 @@ protected:
 
 public:
     static RustError LDE_aux(const gpu_t& gpu, fr_t* inout,
-                             uint32_t lg_domain_size, uint32_t lg_blowup,
-                             bool ext_pow = false)
+                             uint32_t lg_domain_size, uint32_t lg_blowup)
     {
         try {
             size_t domain_size = (size_t)1 << lg_domain_size;
@@ -275,7 +268,7 @@ public:
             bit_rev(aux_data, domain_data, lg_domain_size, gpu);
 
             LDE_launch(gpu, ext_domain_data, domain_data, gen_powers,
-                       lg_domain_size, lg_blowup, true, ext_pow);
+                       lg_domain_size, lg_blowup);
 
             // NTT - RN
             NTT_internal(ext_domain_data, lg_domain_size + lg_blowup,
@@ -296,22 +289,20 @@ public:
         return RustError{cudaSuccess};
     }
 
-    // coset_ext_pow is only used when NTT type is coset
     static void Base_dev_ptr(stream_t& stream, fr_t* d_inout,
                              uint32_t lg_domain_size, InputOutputOrder order,
-                             Direction direction, Type type,
-                             bool coset_ext_pow = false)
+                             Direction direction, Type type)
     {
         size_t domain_size = (size_t)1 << lg_domain_size;
 
         NTT_internal(&d_inout[0], lg_domain_size, order, direction, type,
-                     stream, coset_ext_pow);
+                     stream);
     }
 
     static void LDE_powers(stream_t& stream, fr_t* d_inout,
-                           uint32_t lg_domain_size, bool ext_pow = false)
+                           uint32_t lg_domain_size)
     {
-        LDE_powers(d_inout, false, true, lg_domain_size, 0, stream, ext_pow);
+        LDE_powers(d_inout, false, true, lg_domain_size, 0, stream);
     }
 
     // If d_out and d_in overlap, d_out is expected to encompass d_in and

From a6799d9a8472dbd451b98a04dec9ad099fb7471a Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Mon, 27 Nov 2023 18:04:35 +0100
Subject: [PATCH 08/20] ntt/{kernels.cu,ntt.cuh}: ensure LDE_powers works even
 for large domain sizes.

---
 ntt/kernels.cu | 30 +++++++++++++++---------------
 ntt/ntt.cuh    | 15 ++++++++-------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/ntt/kernels.cu b/ntt/kernels.cu
index 3d78821..da000ee 100644
--- a/ntt/kernels.cu
+++ b/ntt/kernels.cu
@@ -175,26 +175,26 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE],
 }
 
 __launch_bounds__(1024) __global__
-void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_blowup, bool bitrev,
+void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_domain_size,
+                           uint32_t lg_blowup, bool bitrev,
                            const fr_t (*gen_powers)[WINDOW_SIZE])
 {
-    index_t idx = threadIdx.x + blockDim.x * (index_t)blockIdx.x;
-    index_t pow = idx;
-    fr_t r = d_inout[idx];
-
-    if (bitrev) {
-        size_t domain_size = gridDim.x * (size_t)blockDim.x;
-        assert((domain_size & (domain_size-1)) == 0);
-        uint32_t lg_domain_size = 63 - __clzll(domain_size);
-
-        pow = bit_rev(idx, lg_domain_size);
-    }
+#if 0
+    assert(blockDim.x * gridDim.x == blockDim.x * (size_t)gridDim.x);
+#endif
+    size_t domain_size = (size_t)1 << lg_domain_size;
+    index_t idx = threadIdx.x + blockDim.x * blockIdx.x;
 
-    pow <<= lg_blowup;
+    #pragma unroll 1
+    for (; idx < domain_size; idx += blockDim.x * gridDim.x) {
+        fr_t r = d_inout[idx];
 
-    r = r * get_intermediate_root(pow, gen_powers);
+        index_t pow = bitrev ? bit_rev(idx, lg_domain_size) : idx;
+        pow <<= lg_blowup;
+        r *= get_intermediate_root(pow, gen_powers);
 
-    d_inout[idx] = r;
+        d_inout[idx] = r;
+    }
 }
 
 __launch_bounds__(1024) __global__
diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh
index ca4fcd5..f68070e 100644
--- a/ntt/ntt.cuh
+++ b/ntt/ntt.cuh
@@ -57,22 +57,23 @@ protected:
 
 private:
     static void LDE_powers(fr_t* inout, bool innt, bool bitrev,
-                           uint32_t lg_domain_size, uint32_t lg_blowup,
+                           uint32_t lg_dsz, uint32_t lg_blowup,
                            stream_t& stream)
     {
-        size_t domain_size = (size_t)1 << lg_domain_size;
+        size_t domain_size = (size_t)1 << lg_dsz;
         const auto gen_powers =
             NTTParameters::all(innt)[stream]->partial_group_gen_powers;
 
         if (domain_size < WARP_SZ)
             LDE_distribute_powers<<<1, domain_size, 0, stream>>>
-                                 (inout, lg_blowup, bitrev, gen_powers);
-        else if (domain_size < 512)
+                                 (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
+        else if (lg_dsz < 32)
             LDE_distribute_powers<<<domain_size / WARP_SZ, WARP_SZ, 0, stream>>>
-                                 (inout, lg_blowup, bitrev, gen_powers);
+                                 (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
         else
-            LDE_distribute_powers<<<domain_size / 512, 512, 0, stream>>>
-                                 (inout, lg_blowup, bitrev, gen_powers);
+            LDE_distribute_powers<<<gpu_props(stream).multiProcessorCount, 1024,
+                                    0, stream>>>
+                                 (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
 
         CUDA_OK(cudaGetLastError());
     }

From 3e9867c36544bb38bcd23703cd0050a5e39e5505 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Mon, 27 Nov 2023 18:14:12 +0100
Subject: [PATCH 09/20] ntt/ntt.cuh: make sense of |aux_data| in LDE_aux and
 de-duplicate LDE().

---
 ntt/ntt.cuh | 65 +++++++++++++++++------------------------------------
 1 file changed, 20 insertions(+), 45 deletions(-)

diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh
index f68070e..d783927 100644
--- a/ntt/ntt.cuh
+++ b/ntt/ntt.cuh
@@ -164,47 +164,6 @@ public:
         return RustError{cudaSuccess};
     }
 
-    static RustError LDE(const gpu_t& gpu, fr_t* inout,
-                         uint32_t lg_domain_size, uint32_t lg_blowup)
-    {
-        try {
-            gpu.select();
-
-            size_t domain_size = (size_t)1 << lg_domain_size;
-            size_t ext_domain_size = domain_size << lg_blowup;
-            dev_ptr_t<fr_t> d_ext_domain{ext_domain_size, gpu};
-            fr_t* d_domain = &d_ext_domain[ext_domain_size - domain_size];
-
-            gpu.HtoD(&d_domain[0], inout, domain_size);
-
-            NTT_internal(&d_domain[0], lg_domain_size,
-                         InputOutputOrder::NR, Direction::inverse,
-                         Type::standard, gpu);
-
-            const auto gen_powers =
-                NTTParameters::all()[gpu.id()]->partial_group_gen_powers;
-
-            LDE_launch(gpu, &d_ext_domain[0], &d_domain[0],
-                       gen_powers, lg_domain_size, lg_blowup);
-
-            NTT_internal(&d_ext_domain[0], lg_domain_size + lg_blowup,
-                         InputOutputOrder::RN, Direction::forward,
-                         Type::standard, gpu);
-
-            gpu.DtoH(inout, &d_ext_domain[0], ext_domain_size);
-            gpu.sync();
-        } catch (const cuda_error& e) {
-            gpu.sync();
-#ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE
-            return RustError{e.code(), e.what()};
-#else
-            return RustError{e.code()};
-#endif
-        }
-
-        return RustError{cudaSuccess};
-    }
-
 protected:
     static void LDE_launch(stream_t& stream,
                            fr_t* ext_domain_data, fr_t* domain_data,
@@ -246,17 +205,20 @@ protected:
 
 public:
     static RustError LDE_aux(const gpu_t& gpu, fr_t* inout,
-                             uint32_t lg_domain_size, uint32_t lg_blowup)
+                             uint32_t lg_domain_size, uint32_t lg_blowup,
+                             fr_t *aux_out = nullptr)
     {
         try {
             size_t domain_size = (size_t)1 << lg_domain_size;
             size_t ext_domain_size = domain_size << lg_blowup;
+            size_t aux_size = aux_out != nullptr ? domain_size : 0;
             // The 2nd to last 'domain_size' chunk will hold the original data
             // The last chunk will get the bit reversed iNTT data
-            dev_ptr_t<fr_t> d_inout{ext_domain_size + domain_size, gpu}; // + domain_size for aux buffer
+            dev_ptr_t<fr_t> d_inout{ext_domain_size + aux_size, gpu}; // + domain_size for aux buffer
             fr_t* aux_data = &d_inout[ext_domain_size];
             fr_t* domain_data = &d_inout[ext_domain_size - domain_size]; // aligned to the end
             fr_t* ext_domain_data = &d_inout[0];
+
             gpu.HtoD(domain_data, inout, domain_size);
 
             NTT_internal(domain_data, lg_domain_size,
@@ -266,7 +228,12 @@ public:
             const auto gen_powers =
                 NTTParameters::all()[gpu.id()]->partial_group_gen_powers;
 
-            bit_rev(aux_data, domain_data, lg_domain_size, gpu);
+            event_t sync_event;
+
+            if (aux_out != nullptr) {
+                bit_rev(aux_data, domain_data, lg_domain_size, gpu);
+                sync_event.record(gpu);
+            }
 
             LDE_launch(gpu, ext_domain_data, domain_data, gen_powers,
                        lg_domain_size, lg_blowup);
@@ -276,7 +243,11 @@ public:
                          InputOutputOrder::RN, Direction::forward,
                          Type::standard, gpu);
 
-            gpu.DtoH(inout, ext_domain_data, domain_size << lg_blowup);
+            if (aux_out != nullptr) {
+                sync_event.wait(gpu[0]);
+                gpu[0].DtoH(aux_out, aux_data, aux_size);
+            }
+            gpu.DtoH(inout, ext_domain_data, ext_domain_size);
             gpu.sync();
         } catch (const cuda_error& e) {
             gpu.sync();
@@ -290,6 +261,10 @@ public:
         return RustError{cudaSuccess};
     }
 
+    static RustError LDE(const gpu_t& gpu, fr_t* inout,
+                         uint32_t lg_domain_size, uint32_t lg_blowup)
+    {   return LDE_aux(gpu, inout, lg_domain_size, lg_blowup);   }
+
     static void Base_dev_ptr(stream_t& stream, fr_t* d_inout,
                              uint32_t lg_domain_size, InputOutputOrder order,
                              Direction direction, Type type)

From d575d33c079224ff982d6e8505408c76d22e0f26 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Tue, 2 Jan 2024 12:38:43 +0100
Subject: [PATCH 10/20] poc/msm-cuda/Cargo.toml: clean up.

---
 poc/msm-cuda/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/poc/msm-cuda/Cargo.toml b/poc/msm-cuda/Cargo.toml
index 6b606b7..226e580 100644
--- a/poc/msm-cuda/Cargo.toml
+++ b/poc/msm-cuda/Cargo.toml
@@ -2,6 +2,7 @@
 name = "msm-cuda"
 version = "0.1.0"
 edition = "2021"
+publish = false
 
 [features]
 # By default, compile with ADX extension if the host supports it.
@@ -37,7 +38,6 @@ which = "^4.0"
 
 [dev-dependencies]
 criterion = { version = "0.3", features = [ "html_reports" ] }
-rayon = "1.5"
 
 [[bench]]
 name = "msm"

From 892c112a32a7b7babf86a1fe73593bcf40337882 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Tue, 2 Jan 2024 12:39:25 +0100
Subject: [PATCH 11/20] util/thread_pool_t.hpp: resolve -Wunused-parameter on
 non-Linux.

---
 util/thread_pool_t.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/util/thread_pool_t.hpp b/util/thread_pool_t.hpp
index 5ab7385..ff394a6 100644
--- a/util/thread_pool_t.hpp
+++ b/util/thread_pool_t.hpp
@@ -127,6 +127,8 @@ class thread_pool_t {
 
             return;
         }
+#else
+        (void)affinity_env;
 #endif
         init(0);
     }

From 60e5157a7467e4f126d859fce98bac5d2ba50945 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Tue, 2 Jan 2024 12:41:44 +0100
Subject: [PATCH 12/20] util/{gpu_t.cuh,rusterror.h}: make it compile on
 Windows.

---
 util/gpu_t.cuh   | 2 +-
 util/rusterror.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/util/gpu_t.cuh b/util/gpu_t.cuh
index e9e5eda..008a1f6 100644
--- a/util/gpu_t.cuh
+++ b/util/gpu_t.cuh
@@ -299,7 +299,7 @@ public:
     inline operator T*() const                  { return ptr->ptr; }
 
     // facilitate return by value through FFI, as gpu_ptr_t<T>::by_value.
-    struct by_value         { inner *ptr; };
+    using by_value = struct { inner *ptr; };
     operator by_value() const
     {   ptr->ref_cnt.fetch_add(1, std::memory_order_relaxed); return {ptr};   }
     gpu_ptr_t(by_value v)   { ptr = v.ptr; }
diff --git a/util/rusterror.h b/util/rusterror.h
index 9e8996e..02c0aff 100644
--- a/util/rusterror.h
+++ b/util/rusterror.h
@@ -24,7 +24,7 @@ struct RustError { /* to be returned exclusively by value */
     {   message = str==nullptr ? nullptr : strdup(str);   }
     // no destructor[!], Rust takes care of the |message|
 
-    struct by_value {
+    using by_value = struct {
         int code;
         char *message;
     };

From c536f0964f7dba7d5be5d385d1ccbcaaa7f2bfab Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Tue, 2 Jan 2024 12:45:24 +0100
Subject: [PATCH 13/20] ff/*: add shfl_bfly() method.

---
 ff/bb31_t.cuh | 3 +++
 ff/gl64_t.cuh | 3 +++
 ff/mont_t.cuh | 7 +++++++
 3 files changed, 13 insertions(+)

diff --git a/ff/bb31_t.cuh b/ff/bb31_t.cuh
index 9e65c97..2bb3250 100644
--- a/ff/bb31_t.cuh
+++ b/ff/bb31_t.cuh
@@ -353,6 +353,9 @@ public:
     {   return a * b.reciprocal();   }
     inline bb31_t& operator/=(const bb31_t a)
     {   return *this *= a.reciprocal();   }
+
+    inline void shfl_bfly(uint32_t laneMask)
+    {   val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask);   }
 };
 
 #  undef inline
diff --git a/ff/gl64_t.cuh b/ff/gl64_t.cuh
index 0866a66..fd67640 100644
--- a/ff/gl64_t.cuh
+++ b/ff/gl64_t.cuh
@@ -608,6 +608,9 @@ public:
 
         return t1;
     }
+
+    inline void shfl_bfly(uint32_t laneMask)
+    {   val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask);   }
 };
 
 # undef inline
diff --git a/ff/mont_t.cuh b/ff/mont_t.cuh
index cc5c8ee..e4aad31 100644
--- a/ff/mont_t.cuh
+++ b/ff/mont_t.cuh
@@ -1109,6 +1109,13 @@ public:
     {   return a * b.reciprocal();   }
     inline mont_t& operator/=(const mont_t& a)
     {   return *this *= a.reciprocal();   }
+
+    inline void shfl_bfly(uint32_t laneMask)
+    {
+        #pragma unroll
+        for (size_t i=0; i<n; i++)
+            even[i] = __shfl_xor_sync(0xFFFFFFFF, even[i], laneMask);
+    }
 };
 
 # undef inline

From e6089ae30815007e8b24981aea20fcff85251f11 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Tue, 2 Jan 2024 12:46:44 +0100
Subject: [PATCH 14/20] ff/gl64_t.cuh: refactor reduce().

Better readability and marginal improvement in exponentiation.
---
 ff/gl64_t.cuh | 47 +++++++++++++++--------------------------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/ff/gl64_t.cuh b/ff/gl64_t.cuh
index fd67640..9ea17d7 100644
--- a/ff/gl64_t.cuh
+++ b/ff/gl64_t.cuh
@@ -267,45 +267,28 @@ private:
 
     inline void reduce(uint32_t temp[4])
     {
-        uint32_t carry;
 # if __CUDA_ARCH__ >= 700
-        asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, %4; subc.u32 %2, 0, 0;"
-            : "+r"(temp[0]), "+r"(temp[1]), "=r"(carry)
-            : "r"(temp[2]), "r"(temp[3]));
-        asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, %3;"
-            : "+r"(temp[1]), "+r"(carry)
-            : "r"(temp[2]), "r"(temp[3]));
-
-        asm("mad.lo.cc.u32 %0, %3, %4, %0; madc.hi.cc.u32 %1, %3, %4, %1; addc.u32 %2, 0, 0;"
-            : "+r"(temp[0]), "+r"(temp[1]), "=r"(temp[2])
-            : "r"(carry), "r"(gl64_device::W));
-        asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
-            : "+r"(temp[0]), "+r"(temp[1])
-            : "r"(temp[2]), "r"(gl64_device::W));
+        asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1; addc.u32 %2, 0, 0;"
+            : "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2])
+            : "r"(gl64_device::W));
 # else
         uint32_t b0, b1;
-        asm("add.cc.u32 %0, %2, %3; addc.u32 %1, 0, 0;"
+
+        asm("sub.cc.u32 %0, 0, %2; subc.u32 %1, %2, 0;"
             : "=r"(b0), "=r"(b1)
-            : "r"(temp[2]), "r"(temp[3]));
-        asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, %4; subc.u32 %2, 0, 0;"
-            : "+r"(temp[0]), "+r"(temp[1]), "=r"(carry)
+            : "r"(temp[2]));
+        asm("add.cc.u32 %0, %0, %3; addc.cc.u32 %1, %1, %4; addc.u32 %2, 0, 0;"
+            : "+r"(temp[0]), "+r"(temp[1]), "=r"(temp[2])
             : "r"(b0), "r"(b1));
-        asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, %3;"
-            : "+r"(temp[0]), "+r"(temp[1])
-            : "r"(-carry), "r"(carry));
-        asm("add.cc.u32 %0, %0, %1; addc.u32 %1, 0, 0;"
-            : "+r"(temp[1]), "+r"(temp[2]));
+# endif
+        asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, 0; subc.u32 %2, %2, 0;"
+            : "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2])
+            : "r"(temp[3]));
 
-#  if __CUDA_ARCH__ >= 700
-        asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
+        asm("sub.cc.u32 %0, %0, %2; subc.u32 %1, %1, %3;"
             : "+r"(temp[0]), "+r"(temp[1])
-            : "r"(temp[2]), "r"(gl64_device::W));
-#  else
-        asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, 0;"
-            : "+r"(temp[0]), "+r"(temp[1])
-            : "r"(-temp[2]));
-#  endif
-# endif
+            : "r"(temp[2]), "r"(-(int)temp[2]>>1));
+
         asm("mov.b64 %0, {%1, %2};" : "=l"(val) : "r"(temp[0]), "r"(temp[1]));
     }
 

From e5add3fa08908dcabcdb6c01e990b390cd21c294 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Tue, 2 Jan 2024 13:09:02 +0100
Subject: [PATCH 15/20] ntt/kernels/*: improve readability.

---
 ntt/kernels/ct_mixed_radix_narrow.cu | 2 +-
 ntt/kernels/ct_mixed_radix_wide.cu   | 2 +-
 ntt/kernels/gs_mixed_radix_narrow.cu | 2 +-
 ntt/kernels/gs_mixed_radix_wide.cu   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ntt/kernels/ct_mixed_radix_narrow.cu b/ntt/kernels/ct_mixed_radix_narrow.cu
index d90e70b..1c72a4b 100644
--- a/ntt/kernels/ct_mixed_radix_narrow.cu
+++ b/ntt/kernels/ct_mixed_radix_narrow.cu
@@ -156,7 +156,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << (stage + iterations)) - ((index_t)1 << stage);
+    index_t mask = (index_t)((1 << iterations) - 1) << stage;
     index_t rotw = idx0 & mask;
     rotw = (rotw >> 1) | (rotw << (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
diff --git a/ntt/kernels/ct_mixed_radix_wide.cu b/ntt/kernels/ct_mixed_radix_wide.cu
index efa1a58..550eade 100644
--- a/ntt/kernels/ct_mixed_radix_wide.cu
+++ b/ntt/kernels/ct_mixed_radix_wide.cu
@@ -125,7 +125,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << (stage + iterations)) - ((index_t)1 << stage);
+    index_t mask = (index_t)((1 << iterations) - 1) << stage;
     index_t rotw = idx0 & mask;
     rotw = (rotw >> 1) | (rotw << (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
diff --git a/ntt/kernels/gs_mixed_radix_narrow.cu b/ntt/kernels/gs_mixed_radix_narrow.cu
index 3f0946c..b8added 100644
--- a/ntt/kernels/gs_mixed_radix_narrow.cu
+++ b/ntt/kernels/gs_mixed_radix_narrow.cu
@@ -159,7 +159,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << stage) - ((index_t)1 << (stage - iterations));
+    index_t mask = (index_t)((1 << iterations) - 1) << (stage - iterations);
     index_t rotw = idx0 & mask;
     rotw = (rotw << 1) | (rotw >> (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
diff --git a/ntt/kernels/gs_mixed_radix_wide.cu b/ntt/kernels/gs_mixed_radix_wide.cu
index b6b94d4..7487e5d 100644
--- a/ntt/kernels/gs_mixed_radix_wide.cu
+++ b/ntt/kernels/gs_mixed_radix_wide.cu
@@ -119,7 +119,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << stage) - ((index_t)1 << (stage - iterations));
+    index_t mask = (index_t)((1 << iterations) - 1) << (stage - iterations);
     index_t rotw = idx0 & mask;
     rotw = (rotw << 1) | (rotw >> (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);

From c332cc9041f2f48d1e437c02eda75d25c3872ea3 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Tue, 2 Jan 2024 13:11:20 +0100
Subject: [PATCH 16/20] ntt/{kernels.cu,kernels/*}: switch to shfl_bfly()
 method and clean up.

---
 ntt/kernels.cu                       | 25 -------------------------
 ntt/kernels/ct_mixed_radix_narrow.cu |  2 +-
 ntt/kernels/ct_mixed_radix_wide.cu   |  2 +-
 ntt/kernels/gs_mixed_radix_narrow.cu |  2 +-
 ntt/kernels/gs_mixed_radix_wide.cu   |  2 +-
 5 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/ntt/kernels.cu b/ntt/kernels.cu
index da000ee..ee68b28 100644
--- a/ntt/kernels.cu
+++ b/ntt/kernels.cu
@@ -7,31 +7,6 @@
 
 #include <cooperative_groups.h>
 
-#ifdef __CUDA_ARCH__
-__device__ __forceinline__
-void shfl_bfly(fr_t& r, int laneMask)
-{
-    #pragma unroll
-    for (int iter = 0; iter < r.len(); iter++)
-        r[iter] = __shfl_xor_sync(0xFFFFFFFF, r[iter], laneMask);
-}
-#endif
-
-__device__ __forceinline__
-void shfl_bfly(index_t& index, int laneMask)
-{
-    index = __shfl_xor_sync(0xFFFFFFFF, index, laneMask);
-}
-
-template<typename T>
-__device__ __forceinline__
-void swap(T& u1, T& u2)
-{
-    T temp = u1;
-    u1 = u2;
-    u2 = temp;
-}
-
 template<typename T>
 __device__ __forceinline__
 T bit_rev(T i, unsigned int nbits)
diff --git a/ntt/kernels/ct_mixed_radix_narrow.cu b/ntt/kernels/ct_mixed_radix_narrow.cu
index 1c72a4b..1f008df 100644
--- a/ntt/kernels/ct_mixed_radix_narrow.cu
+++ b/ntt/kernels/ct_mixed_radix_narrow.cu
@@ -100,7 +100,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         for (int z = 0; z < z_count; z++) {
             fr_t t = fr_t::csel(r[1][z], r[0][z], pos);
 
-            shfl_bfly(t, laneMask);
+            t.shfl_bfly(laneMask);
 
             r[0][z] = fr_t::csel(t, r[0][z], !pos);
             r[1][z] = fr_t::csel(t, r[1][z], pos);
diff --git a/ntt/kernels/ct_mixed_radix_wide.cu b/ntt/kernels/ct_mixed_radix_wide.cu
index 550eade..eec4a61 100644
--- a/ntt/kernels/ct_mixed_radix_wide.cu
+++ b/ntt/kernels/ct_mixed_radix_wide.cu
@@ -80,7 +80,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
 
 #ifdef __CUDA_ARCH__
         fr_t x = fr_t::csel(r1, r0, pos);
-        shfl_bfly(x, laneMask);
+        x.shfl_bfly(laneMask);
         r0 = fr_t::csel(x, r0, !pos);
         r1 = fr_t::csel(x, r1, pos);
 #endif
diff --git a/ntt/kernels/gs_mixed_radix_narrow.cu b/ntt/kernels/gs_mixed_radix_narrow.cu
index b8added..a9f5f0a 100644
--- a/ntt/kernels/gs_mixed_radix_narrow.cu
+++ b/ntt/kernels/gs_mixed_radix_narrow.cu
@@ -104,7 +104,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
 #ifdef __CUDA_ARCH__
             t = fr_t::csel(r[1][z], r[0][z], pos);
 
-            shfl_bfly(t, laneMask);
+            t.shfl_bfly(laneMask);
 
             r[0][z] = fr_t::csel(t, r[0][z], !pos);
             r[1][z] = fr_t::csel(t, r[1][z], pos);
diff --git a/ntt/kernels/gs_mixed_radix_wide.cu b/ntt/kernels/gs_mixed_radix_wide.cu
index 7487e5d..cd64da8 100644
--- a/ntt/kernels/gs_mixed_radix_wide.cu
+++ b/ntt/kernels/gs_mixed_radix_wide.cu
@@ -70,7 +70,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         bool pos = rank < laneMask;
 #ifdef __CUDA_ARCH__
         t = fr_t::csel(r1, r0, pos);
-        shfl_bfly(t, laneMask);
+        t.shfl_bfly(laneMask);
         r0 = fr_t::csel(t, r0, !pos);
         r1 = fr_t::csel(t, r1, pos);
 #endif

From 370916e5abb65be8cd3555dc70d624771ff47f3f Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Sun, 7 Jan 2024 21:52:54 +0100
Subject: [PATCH 17/20] ff/mont_t.cuh: fix unary operator-().

---
 ff/mont_t.cuh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ff/mont_t.cuh b/ff/mont_t.cuh
index e4aad31..9ba34d6 100644
--- a/ff/mont_t.cuh
+++ b/ff/mont_t.cuh
@@ -311,10 +311,10 @@ public:
         asm("}");
         return *this;
     }
-    friend inline mont_t cneg(mont_t a, bool flag)
+    static inline mont_t cneg(mont_t a, bool flag)
     {   return a.cneg(flag);   }
 #else
-    friend inline mont_t cneg(const mont_t& a, bool flag)
+    static inline mont_t cneg(const mont_t& a, bool flag)
     {
         size_t i;
         uint32_t tmp[n], is_zero = a[0];
@@ -335,6 +335,8 @@ public:
         asm("}");
         return ret;
     }
+    inline mont_t& cneg(bool flag)
+    {   return *this = cneg(*this, flag);   }
 #endif
     inline mont_t operator-() const
     {   return cneg(*this, true);   }

From 65739f435088ff6d28f2e4baeb8404b1e4ad7836 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Sun, 7 Jan 2024 21:57:23 +0100
Subject: [PATCH 18/20] ff/{bb31,gl64}_t.cuh: harmonize unary operator-()
 implementations.

---
 ff/bb31_t.cuh | 4 ++--
 ff/gl64_t.cuh | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ff/bb31_t.cuh b/ff/bb31_t.cuh
index 2bb3250..fb2a04f 100644
--- a/ff/bb31_t.cuh
+++ b/ff/bb31_t.cuh
@@ -104,10 +104,10 @@ public:
 
         return *this;
     }
-    friend inline bb31_t cneg(bb31_t a, bool flag)
+    static inline bb31_t cneg(bb31_t a, bool flag)
     {   return a.cneg(flag);   }
     inline bb31_t operator-() const
-    {   bb31_t ret = *this; return ret.cneg(true);   }
+    {   return cneg(*this, true);   }
 
     static inline const bb31_t one()    { return bb31_t{ONE}; }
     inline bool is_one() const          { return val == ONE;  }
diff --git a/ff/gl64_t.cuh b/ff/gl64_t.cuh
index 9ea17d7..eb7164d 100644
--- a/ff/gl64_t.cuh
+++ b/ff/gl64_t.cuh
@@ -190,10 +190,10 @@ public:
 
         return *this;
     }
-    friend inline gl64_t cneg(gl64_t a, bool flag)
+    static inline gl64_t cneg(gl64_t a, bool flag)
     {   return a.cneg(flag);   }
     inline gl64_t operator-() const
-    {   gl64_t ret = *this; return ret.cneg(true);   }
+    {   return cneg(*this, true);   }
 
     static inline const gl64_t one()
     {   gl64_t ret; ret.val = 1; return ret;   }

From a24b42bfd37f740d4c0ecaa8d54b2eef5c3245a3 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Sun, 7 Jan 2024 22:03:59 +0100
Subject: [PATCH 19/20] rust/src/lib.rs: add NTT enums declarations.

---
 rust/src/lib.rs | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index a83f5ad..f047ff0 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -93,3 +93,23 @@ impl<T> Clone for Gpu_Ptr<T> {
         unsafe { transmute::<_, _>(clone_gpu_ptr_t(transmute::<&_, &_>(self))) }
     }
 }
+
+#[repr(C)]
+pub enum NTTInputOutputOrder {
+    NN = 0,
+    NR = 1,
+    RN = 2,
+    RR = 3,
+}
+
+#[repr(C)]
+pub enum NTTDirection {
+    Forward = 0,
+    Inverse = 1,
+}
+
+#[repr(C)]
+pub enum NTTType {
+    Standard = 0,
+    Coset = 1,
+}

From ada2ea2103fac3ae6b40a700acafb1f3c9cf6ed8 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@cryptogams.org>
Date: Sun, 7 Jan 2024 22:05:23 +0100
Subject: [PATCH 20/20] poc/ntt-cuda: modernize.

---
 poc/ntt-cuda/src/lib.rs   | 24 ++----------------------
 poc/ntt-cuda/tests/ntt.rs |  2 +-
 2 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/poc/ntt-cuda/src/lib.rs b/poc/ntt-cuda/src/lib.rs
index 6847b4e..92162a1 100644
--- a/poc/ntt-cuda/src/lib.rs
+++ b/poc/ntt-cuda/src/lib.rs
@@ -2,27 +2,7 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-sppark::cuda_error!();
-
-#[repr(C)]
-pub enum NTTInputOutputOrder {
-    NN = 0,
-    NR = 1,
-    RN = 2,
-    RR = 3,
-}
-
-#[repr(C)]
-enum NTTDirection {
-    Forward = 0,
-    Inverse = 1,
-}
-
-#[repr(C)]
-enum NTTType {
-    Standard = 0,
-    Coset = 1,
-}
+use sppark::{NTTInputOutputOrder, NTTDirection, NTTType};
 
 extern "C" {
     fn compute_ntt(
@@ -32,7 +12,7 @@ extern "C" {
         ntt_order: NTTInputOutputOrder,
         ntt_direction: NTTDirection,
         ntt_type: NTTType,
-    ) -> cuda::Error;
+    ) -> sppark::Error;
 }
 
 /// Compute an in-place NTT on the input data.
diff --git a/poc/ntt-cuda/tests/ntt.rs b/poc/ntt-cuda/tests/ntt.rs
index 9f69a26..bb8c1ba 100644
--- a/poc/ntt-cuda/tests/ntt.rs
+++ b/poc/ntt-cuda/tests/ntt.rs
@@ -2,7 +2,7 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-use ntt_cuda::NTTInputOutputOrder;
+use sppark::NTTInputOutputOrder;
 
 const DEFAULT_GPU: usize = 0;