diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..cea979d
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,108 @@
+name: test-build
+
+on:
+  push:
+    branches:
+      - '**'
+  workflow_dispatch:
+    branches:
+      - '**'
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  cargo-test-no-run:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Get date
+        id: get-date
+        run: echo "date=$(date -u +%Y-%m)" >> $GITHUB_OUTPUT
+        shell: bash
+
+      - uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry
+            **/Cargo.lock
+            **/target
+          key: ${{ runner.os }}-cargo-${{ steps.get-date.outputs.date }}
+
+      - name: Environment
+        shell: bash
+        run: |
+          lscpu 2>/dev/null && echo --- || true
+          env | sort
+
+      - name: Install cuda-minimal-build-12-3
+        shell: bash
+        run: |
+          # https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get -y install cuda-minimal-build-12-3
+          [ -d /usr/local/cuda-12.3/bin ]
+
+      - name: Test-build poc/ntt-cuda
+        shell: bash
+        run: |
+          rustc --version --verbose
+          export PATH=$PATH:/usr/local/cuda-12.3/bin
+          ( cd poc/ntt-cuda
+            cargo update
+            cargo test --no-run --release --features=bls12_381
+            cargo test --no-run --release --features=gl64
+            cargo test --no-run --release --features=bb31
+            if which clang++ 2>/dev/null; then
+                echo
+                echo Testing with clang++
+                echo
+                clang++ --version
+                echo
+                export CXX=clang++
+                cargo test --no-run --release --features=bls12_381
+                cargo test --no-run --release --features=gl64
+                cargo test --no-run --release --features=bb31
+            fi
+            cargo clean -p ntt-cuda
+            cargo clean -p ntt-cuda --release
+            rm -rf target/.rustc_info.json
+            rm -rf target/package
+            rm -rf target/{debug,release}/incremental
+            rm -rf target/*/{debug,release}/incremental
+          )
+
+      - name: Test-build poc/msm-cuda
+        shell: bash
+        run: |
+          rustc --version --verbose
+          export PATH=$PATH:/usr/local/cuda-12.3/bin
+          ( cd poc/msm-cuda
+            sed "s/^crit/#crit/" Cargo.toml > Cargo.$$.toml && \
+            mv Cargo.$$.toml Cargo.toml
+            cargo update
+            cargo test --no-run --release --features=bls12_381,quiet
+            cargo test --no-run --release --features=bn254,quiet
+            if which clang++ 2>/dev/null; then
+                echo
+                echo Testing with clang++
+                echo
+                clang++ --version
+                echo
+                export CXX=clang++
+                cargo test --no-run --release --features=bls12_381,quiet
+                cargo test --no-run --release --features=bn254,quiet
+            fi
+            cargo clean -p msm-cuda
+            cargo clean -p msm-cuda --release
+            rm -rf target/.rustc_info.json
+            rm -rf target/package
+            rm -rf target/{debug,release}/incremental
+            rm -rf target/*/{debug,release}/incremental
+          )
+          rm -rf ~/.cargo/registry/src
+          rm -rf ~/.cargo/registry/index/*/.cache
diff --git a/ff/baby_bear.hpp b/ff/baby_bear.hpp
index cce438a..c7439f0 100644
--- a/ff/baby_bear.hpp
+++ b/ff/baby_bear.hpp
@@ -27,18 +27,18 @@ class bb31_t {
     // this is used in constant declaration, e.g. as bb31_t{11}
     inline constexpr bb31_t(int a) : val(((uint64_t)a << 32) % MOD) {}
 
-    static inline const bb31_t one()    { return bb31_t(1); }
-    inline bb31_t& operator+=(bb31_t b) { return *this; }
-    inline bb31_t& operator-=(bb31_t b) { return *this; }
-    inline bb31_t& operator*=(bb31_t b) { return *this; }
-    inline bb31_t& operator^=(int b)    { return *this; }
-    inline bb31_t& sqr()                { return *this; }
-    friend bb31_t operator+(bb31_t a, bb31_t b) { return a += b; }
-    friend bb31_t operator-(bb31_t a, bb31_t b) { return a -= b; }
-    friend bb31_t operator*(bb31_t a, bb31_t b) { return a *= b; }
-    friend bb31_t operator^(bb31_t a, int b)    { return a ^= b; }
-    inline void zero()                  { val = 0; }
-    inline bool is_zero() const         { return val==0; }
+    static inline const bb31_t one()    { return bb31_t(1); }
+    inline bb31_t& operator+=(bb31_t b) { return *this; }
+    inline bb31_t& operator-=(bb31_t b) { return *this; }
+    inline bb31_t& operator*=(bb31_t b) { return *this; }
+    inline bb31_t& operator^=(int b)    { return *this; }
+    inline bb31_t& sqr()                { return *this; }
+    friend bb31_t operator+(bb31_t a, bb31_t b) { return a += b; }
+    friend bb31_t operator-(bb31_t a, bb31_t b) { return a -= b; }
+    friend bb31_t operator*(bb31_t a, bb31_t b) { return a *= b; }
+    friend bb31_t operator^(bb31_t a, uint32_t b) { return a ^= b; }
+    inline void zero()                  { val = 0; }
+    inline bool is_zero() const         { return val==0; }
 
     inline operator uint32_t() const
     {   return ((val*M)*(uint64_t)MOD + val) >> 32;   }
    inline void to()    { val = ((uint64_t)val<<32) % MOD; }
diff --git a/ff/bb31_t.cuh b/ff/bb31_t.cuh
index 9e65c97..fb2a04f 100644
--- a/ff/bb31_t.cuh
+++ b/ff/bb31_t.cuh
@@ -104,10 +104,10 @@ public:
         return *this;
     }
 
-    friend inline bb31_t cneg(bb31_t a, bool flag)
+    static inline bb31_t cneg(bb31_t a, bool flag)
     {   return a.cneg(flag);   }
     inline bb31_t operator-() const
-    {   bb31_t ret = *this; return ret.cneg(true);   }
+    {   return cneg(*this, true);   }
 
     static inline const bb31_t one()    { return bb31_t{ONE}; }
     inline bool is_one() const          { return val == ONE; }
@@ -353,6 +353,9 @@ public:
     {   return a * b.reciprocal();   }
     inline bb31_t& operator/=(const bb31_t a)
     {   return *this *= a.reciprocal();   }
+
+    inline void shfl_bfly(uint32_t laneMask)
+    {   val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask);   }
 };
 
 # undef inline
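Note on the change above: bb31_t::shfl_bfly() turns the warp-level butterfly exchange into a method on the field type (gl64_t and mont_t below get the same treatment), replacing the free-standing shfl_bfly() helpers that a later hunk removes from ntt/kernels.cu. The primitive underneath is __shfl_xor_sync(): each lane swaps its register value with the lane whose ID differs in the bits of laneMask. A self-contained CUDA sketch of that behavior, with illustrative names that are not part of the patch:

    #include <cstdio>
    #include <cstdint>

    __global__ void bfly_demo(uint32_t* out, uint32_t laneMask)
    {
        uint32_t lane = threadIdx.x & 31;
        uint32_t val = lane;                  // each lane starts with its own ID
        // the butterfly exchange: lane i ends up with the value from lane i^laneMask
        val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask);
        out[threadIdx.x] = val;               // out[i] == i ^ laneMask
    }

    int main()
    {
        uint32_t* d_out;
        cudaMalloc(&d_out, 32 * sizeof(uint32_t));
        bfly_demo<<<1, 32>>>(d_out, 4);
        uint32_t h_out[32];
        cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
        for (int i = 0; i < 32; i++)
            printf("%2d -> %2u\n", i, h_out[i]);   // prints i ^ 4
        cudaFree(d_out);
        return 0;
    }

In the NTT kernels this pairs threads whose data indices differ by the current butterfly distance, so the exchange needs no shared memory round-trip.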
: "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2]) + : "r"(gl64_device::W)); # else uint32_t b0, b1; - asm("add.cc.u32 %0, %2, %3; addc.u32 %1, 0, 0;" + + asm("sub.cc.u32 %0, 0, %2; subc.u32 %1, %2, 0;" : "=r"(b0), "=r"(b1) - : "r"(temp[2]), "r"(temp[3])); - asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, %4; subc.u32 %2, 0, 0;" - : "+r"(temp[0]), "+r"(temp[1]), "=r"(carry) + : "r"(temp[2])); + asm("add.cc.u32 %0, %0, %3; addc.cc.u32 %1, %1, %4; addc.u32 %2, 0, 0;" + : "+r"(temp[0]), "+r"(temp[1]), "=r"(temp[2]) : "r"(b0), "r"(b1)); - asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, %3;" - : "+r"(temp[0]), "+r"(temp[1]) - : "r"(-carry), "r"(carry)); - asm("add.cc.u32 %0, %0, %1; addc.u32 %1, 0, 0;" - : "+r"(temp[1]), "+r"(temp[2])); +# endif + asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, 0; subc.u32 %2, %2, 0;" + : "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2]) + : "r"(temp[3])); -# if __CUDA_ARCH__ >= 700 - asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;" + asm("sub.cc.u32 %0, %0, %2; subc.u32 %1, %1, %3;" : "+r"(temp[0]), "+r"(temp[1]) - : "r"(temp[2]), "r"(gl64_device::W)); -# else - asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, 0;" - : "+r"(temp[0]), "+r"(temp[1]) - : "r"(-temp[2])); -# endif -# endif + : "r"(temp[2]), "r"(-(int)temp[2]>>1)); + asm("mov.b64 %0, {%1, %2};" : "=l"(val) : "r"(temp[0]), "r"(temp[1])); } @@ -608,6 +591,9 @@ public: return t1; } + + inline void shfl_bfly(uint32_t laneMask) + { val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask); } }; # undef inline diff --git a/ff/mont_t.cuh b/ff/mont_t.cuh index cc5c8ee..9ba34d6 100644 --- a/ff/mont_t.cuh +++ b/ff/mont_t.cuh @@ -311,10 +311,10 @@ public: asm("}"); return *this; } - friend inline mont_t cneg(mont_t a, bool flag) + static inline mont_t cneg(mont_t a, bool flag) { return a.cneg(flag); } #else - friend inline mont_t cneg(const mont_t& a, bool flag) + static inline mont_t cneg(const mont_t& a, bool flag) { size_t i; uint32_t tmp[n], is_zero = a[0]; @@ -335,6 +335,8 @@ public: asm("}"); return ret; } + inline mont_t& cneg(bool flag) + { return *this = cneg(*this, flag); } #endif inline mont_t operator-() const { return cneg(*this, true); } @@ -1109,6 +1111,13 @@ public: { return a * b.reciprocal(); } inline mont_t& operator/=(const mont_t& a) { return *this *= a.reciprocal(); } + + inline void shfl_bfly(uint32_t laneMask) + { + #pragma unroll + for (size_t i=0; i -#ifdef __CUDA_ARCH__ -__device__ __forceinline__ -void shfl_bfly(fr_t& r, int laneMask) -{ - #pragma unroll - for (int iter = 0; iter < r.len(); iter++) - r[iter] = __shfl_xor_sync(0xFFFFFFFF, r[iter], laneMask); -} -#endif - -__device__ __forceinline__ -void shfl_bfly(index_t& index, int laneMask) -{ - index = __shfl_xor_sync(0xFFFFFFFF, index, laneMask); -} - -template -__device__ __forceinline__ -void swap(T& u1, T& u2) -{ - T temp = u1; - u1 = u2; - u2 = temp; -} - template __device__ __forceinline__ T bit_rev(T i, unsigned int nbits) @@ -142,7 +117,31 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE], { unsigned int off = 0; - fr_t t, root = roots[off][pow % WINDOW_SIZE]; + fr_t t, root; + + if (sizeof(fr_t) <= 8) { + root = fr_t::one(); + bool root_set = false; + + #pragma unroll + for (unsigned int pow_win, i = 0; i < WINDOW_NUM; i++) { + if (!root_set && (pow_win = pow % WINDOW_SIZE)) { + root = roots[i][pow_win]; + root_set = true; + } + if (!root_set) { + pow >>= LG_WINDOW_SIZE; + off++; + } + } + } else { + if ((pow % WINDOW_SIZE) == 0) { + pow >>= LG_WINDOW_SIZE; + off++; + } + root = roots[off][pow % 
diff --git a/ff/mont_t.cuh b/ff/mont_t.cuh
index cc5c8ee..9ba34d6 100644
--- a/ff/mont_t.cuh
+++ b/ff/mont_t.cuh
@@ -311,10 +311,10 @@ public:
         asm("}");
         return *this;
     }
-    friend inline mont_t cneg(mont_t a, bool flag)
+    static inline mont_t cneg(mont_t a, bool flag)
     {   return a.cneg(flag);   }
 #else
-    friend inline mont_t cneg(const mont_t& a, bool flag)
+    static inline mont_t cneg(const mont_t& a, bool flag)
     {
         size_t i;
         uint32_t tmp[n], is_zero = a[0];
@@ -335,6 +335,8 @@ public:
         asm("}");
         return ret;
     }
+    inline mont_t& cneg(bool flag)
+    {   return *this = cneg(*this, flag);   }
 #endif
     inline mont_t operator-() const
     {   return cneg(*this, true);   }
@@ -1109,6 +1111,13 @@ public:
     {   return a * b.reciprocal();   }
     inline mont_t& operator/=(const mont_t& a)
     {   return *this *= a.reciprocal();   }
+
+    inline void shfl_bfly(uint32_t laneMask)
+    {
+        #pragma unroll
+        for (size_t i=0; i<n; i++)
+            even[i] = __shfl_xor_sync(0xFFFFFFFF, even[i], laneMask);
+    }
 };
 
 # undef inline
diff --git a/ntt/kernels.cu b/ntt/kernels.cu
--- a/ntt/kernels.cu
+++ b/ntt/kernels.cu
@@ … @@
 #include <cassert>
 
-#ifdef __CUDA_ARCH__
-__device__ __forceinline__
-void shfl_bfly(fr_t& r, int laneMask)
-{
-    #pragma unroll
-    for (int iter = 0; iter < r.len(); iter++)
-        r[iter] = __shfl_xor_sync(0xFFFFFFFF, r[iter], laneMask);
-}
-#endif
-
-__device__ __forceinline__
-void shfl_bfly(index_t& index, int laneMask)
-{
-    index = __shfl_xor_sync(0xFFFFFFFF, index, laneMask);
-}
-
-template<class T>
-__device__ __forceinline__
-void swap(T& u1, T& u2)
-{
-    T temp = u1;
-    u1 = u2;
-    u2 = temp;
-}
-
 template<class T>
 __device__ __forceinline__
 T bit_rev(T i, unsigned int nbits)
@@ -142,7 +117,31 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE],
 {
     unsigned int off = 0;
 
-    fr_t t, root = roots[off][pow % WINDOW_SIZE];
+    fr_t t, root;
+
+    if (sizeof(fr_t) <= 8) {
+        root = fr_t::one();
+        bool root_set = false;
+
+        #pragma unroll
+        for (unsigned int pow_win, i = 0; i < WINDOW_NUM; i++) {
+            if (!root_set && (pow_win = pow % WINDOW_SIZE)) {
+                root = roots[i][pow_win];
+                root_set = true;
+            }
+            if (!root_set) {
+                pow >>= LG_WINDOW_SIZE;
+                off++;
+            }
+        }
+    } else {
+        if ((pow % WINDOW_SIZE) == 0) {
+            pow >>= LG_WINDOW_SIZE;
+            off++;
+        }
+        root = roots[off][pow % WINDOW_SIZE];
+    }
+
     #pragma unroll 1
     while (pow >>= LG_WINDOW_SIZE)
         root *= (t = roots[++off][pow % WINDOW_SIZE]);
@@ -151,28 +150,26 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE],
 }
 
 __launch_bounds__(1024) __global__
-void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_blowup, bool bitrev,
-                           const fr_t (*gen_powers)[WINDOW_SIZE],
-                           bool ext_pow = false)
+void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_domain_size,
+                           uint32_t lg_blowup, bool bitrev,
+                           const fr_t (*gen_powers)[WINDOW_SIZE])
 {
-    index_t idx = threadIdx.x + blockDim.x * (index_t)blockIdx.x;
-    index_t pow = idx;
-    fr_t r = d_inout[idx];
-
-    if (bitrev) {
-        size_t domain_size = gridDim.x * (size_t)blockDim.x;
-        assert((domain_size & (domain_size-1)) == 0);
-        uint32_t lg_domain_size = 63 - __clzll(domain_size);
+#if 0
+    assert(blockDim.x * gridDim.x == blockDim.x * (size_t)gridDim.x);
+#endif
+    size_t domain_size = (size_t)1 << lg_domain_size;
+    index_t idx = threadIdx.x + blockDim.x * blockIdx.x;
 
-        pow = bit_rev(idx, lg_domain_size);
-    }
+    #pragma unroll 1
+    for (; idx < domain_size; idx += blockDim.x * gridDim.x) {
+        fr_t r = d_inout[idx];
 
-    if (ext_pow)
+        index_t pow = bitrev ? bit_rev(idx, lg_domain_size) : idx;
         pow <<= lg_blowup;
+        r *= get_intermediate_root(pow, gen_powers);
 
-    r = r * get_intermediate_root(pow, gen_powers);
-
-    d_inout[idx] = r;
+        d_inout[idx] = r;
+    }
 }
 
 __launch_bounds__(1024) __global__
@@ -252,13 +249,16 @@ void get_intermediate_roots(fr_t& root0, fr_t& root1,
 {
     int win = (WINDOW_NUM - 1) * LG_WINDOW_SIZE;
     int off = (WINDOW_NUM - 1);
+    index_t idxo = idx0 | idx1;
+    index_t mask = ((index_t)1 << win) - 1;
 
     root0 = roots[off][idx0 >> win];
     root1 = roots[off][idx1 >> win];
     #pragma unroll 1
-    while (off--) {
+    while (off-- && (idxo & mask)) {
         fr_t t;
         win -= LG_WINDOW_SIZE;
+        mask >>= LG_WINDOW_SIZE;
         root0 *= (t = roots[off][(idx0 >> win) % WINDOW_SIZE]);
         root1 *= (t = roots[off][(idx1 >> win) % WINDOW_SIZE]);
     }
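For context on the get_intermediate_root*() hunks above: a power g^pow is assembled as a product of per-window table entries, table[i][w] = g^(w << (i*LG_WINDOW_SIZE)). A zero window digit contributes roots[i][0] = 1, so both the new sizeof(fr_t) <= 8 path and the (idxo & mask) early exit merely skip multiplications by one. A self-contained toy, with made-up window parameters rather than sppark's, demonstrating why skipping zero windows is sound:

    #include <cassert>
    #include <cstdint>

    // Toy parameters, not sppark's: Goldilocks field, 4 windows of 8 bits.
    const uint64_t P = 0xFFFFFFFF00000001ULL;
    const unsigned LG_WINDOW = 8, WINDOWS = 4;

    uint64_t mul_mod(uint64_t a, uint64_t b)
    {   return (uint64_t)((unsigned __int128)a * b % P);   }

    uint64_t pow_mod(uint64_t g, uint64_t e)
    {
        uint64_t r = 1;
        for (; e; e >>= 1, g = mul_mod(g, g))
            if (e & 1) r = mul_mod(r, g);
        return r;
    }

    int main()
    {
        uint64_t g = 7;                         // arbitrary base
        static uint64_t table[WINDOWS][1u << LG_WINDOW];
        for (unsigned i = 0; i < WINDOWS; i++)
            for (unsigned w = 0; w < (1u << LG_WINDOW); w++)
                table[i][w] = pow_mod(g, (uint64_t)w << (i * LG_WINDOW));

        uint64_t pow = 0x00ab00cd;              // windows 1 and 3 are zero
        uint64_t root = 1, e = pow;
        for (unsigned i = 0; i < WINDOWS; i++, e >>= LG_WINDOW) {
            unsigned w = e & ((1u << LG_WINDOW) - 1);
            if (w != 0)                         // skipping w == 0 skips a multiply by 1
                root = mul_mod(root, table[i][w]);
        }
        assert(root == pow_mod(g, pow));
        return 0;
    }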
diff --git a/ntt/kernels/ct_mixed_radix_narrow.cu b/ntt/kernels/ct_mixed_radix_narrow.cu
index d90e70b..1f008df 100644
--- a/ntt/kernels/ct_mixed_radix_narrow.cu
+++ b/ntt/kernels/ct_mixed_radix_narrow.cu
@@ -100,7 +100,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         for (int z = 0; z < z_count; z++) {
             fr_t t = fr_t::csel(r[1][z], r[0][z], pos);
 
-            shfl_bfly(t, laneMask);
+            t.shfl_bfly(laneMask);
 
             r[0][z] = fr_t::csel(t, r[0][z], !pos);
             r[1][z] = fr_t::csel(t, r[1][z], pos);
@@ -156,7 +156,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << (stage + iterations)) - ((index_t)1 << stage);
+    index_t mask = (index_t)((1 << iterations) - 1) << stage;
     index_t rotw = idx0 & mask;
     rotw = (rotw >> 1) | (rotw << (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
diff --git a/ntt/kernels/ct_mixed_radix_wide.cu b/ntt/kernels/ct_mixed_radix_wide.cu
index 11366ad..eec4a61 100644
--- a/ntt/kernels/ct_mixed_radix_wide.cu
+++ b/ntt/kernels/ct_mixed_radix_wide.cu
@@ -43,12 +43,11 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         unsigned int nbits = MAX_LG_DOMAIN_SIZE - stage;
         index_t root_idx0 = bit_rev(thread_ntt_idx, nbits) * thread_ntt_pos;
-        index_t root_idx1 = thread_ntt_pos << (nbits - 1);
+        index_t root_idx1 = root_idx0 + (thread_ntt_pos << (nbits - 1));
 
         fr_t first_root, second_root;
         get_intermediate_roots(first_root, second_root,
                                root_idx0, root_idx1, d_partial_twiddles);
-        second_root *= first_root;
 
         r0 *= first_root;
         r1 *= second_root;
@@ -81,7 +80,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
 #ifdef __CUDA_ARCH__
         fr_t x = fr_t::csel(r1, r0, pos);
-        shfl_bfly(x, laneMask);
+        x.shfl_bfly(laneMask);
         r0 = fr_t::csel(x, r0, !pos);
         r1 = fr_t::csel(x, r1, pos);
 #endif
@@ -126,7 +125,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << (stage + iterations)) - ((index_t)1 << stage);
+    index_t mask = (index_t)((1 << iterations) - 1) << stage;
     index_t rotw = idx0 & mask;
     rotw = (rotw >> 1) | (rotw << (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
diff --git a/ntt/kernels/gs_mixed_radix_narrow.cu b/ntt/kernels/gs_mixed_radix_narrow.cu
index 3f0946c..a9f5f0a 100644
--- a/ntt/kernels/gs_mixed_radix_narrow.cu
+++ b/ntt/kernels/gs_mixed_radix_narrow.cu
@@ -104,7 +104,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
 #ifdef __CUDA_ARCH__
             t = fr_t::csel(r[1][z], r[0][z], pos);
 
-            shfl_bfly(t, laneMask);
+            t.shfl_bfly(laneMask);
 
             r[0][z] = fr_t::csel(t, r[0][z], !pos);
             r[1][z] = fr_t::csel(t, r[1][z], pos);
@@ -159,7 +159,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << stage) - ((index_t)1 << (stage - iterations));
+    index_t mask = (index_t)((1 << iterations) - 1) << (stage - iterations);
     index_t rotw = idx0 & mask;
     rotw = (rotw << 1) | (rotw >> (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
diff --git a/ntt/kernels/gs_mixed_radix_wide.cu b/ntt/kernels/gs_mixed_radix_wide.cu
index f5cd4bb..cd64da8 100644
--- a/ntt/kernels/gs_mixed_radix_wide.cu
+++ b/ntt/kernels/gs_mixed_radix_wide.cu
@@ -70,7 +70,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         bool pos = rank < laneMask;
 #ifdef __CUDA_ARCH__
         t = fr_t::csel(r1, r0, pos);
-        shfl_bfly(t, laneMask);
+        t.shfl_bfly(laneMask);
         r0 = fr_t::csel(t, r0, !pos);
         r1 = fr_t::csel(t, r1, pos);
 #endif
@@ -89,12 +89,11 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         unsigned int nbits = MAX_LG_DOMAIN_SIZE - (stage - iterations);
         index_t root_idx0 = bit_rev(thread_ntt_idx, nbits) * thread_ntt_pos;
-        index_t root_idx1 = thread_ntt_pos << (nbits - 1);
+        index_t root_idx1 = root_idx0 + (thread_ntt_pos << (nbits - 1));
 
         fr_t first_root, second_root;
         get_intermediate_roots(first_root, second_root,
                                root_idx0, root_idx1, d_partial_twiddles);
-        second_root *= first_root;
 
         r0 *= first_root;
         r1 *= second_root;
@@ -120,7 +119,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << stage) - ((index_t)1 << (stage - iterations));
+    index_t mask = (index_t)((1 << iterations) - 1) << (stage - iterations);
     index_t rotw = idx0 & mask;
     rotw = (rotw << 1) | (rotw >> (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
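Two recurring changes in the four kernel files above deserve a note. First, since w^a * w^b = w^(a+b) for a root of unity w, folding the offset into the second index (root_idx1 = root_idx0 + ...) lets get_intermediate_roots() return the final second root directly, which is why the follow-up second_root *= first_root multiplication disappears. Second, the rewritten rotation masks are exact equivalents: both expressions denote a run of `iterations` one-bits starting at the given bit position, i.e. 2^shift * (2^iterations - 1). A quick host-side equivalence check, not from the patch, valid for the small radix values these kernels use (iterations stays well below 31, so the 32-bit (1 << iterations) never overflows):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        typedef uint64_t index_t;
        for (unsigned stage = 0; stage < 32; stage++)
            for (unsigned iterations = 1; iterations <= 10; iterations++) {
                index_t old_mask = ((index_t)1 << (stage + iterations))
                                 - ((index_t)1 << stage);
                index_t new_mask = (index_t)((1 << iterations) - 1) << stage;
                assert(old_mask == new_mask);   // same run of one-bits
            }
        return 0;
    }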
diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh
index 734a9a5..d783927 100644
--- a/ntt/ntt.cuh
+++ b/ntt/ntt.cuh
@@ -57,32 +57,31 @@ protected:
 private:
     static void LDE_powers(fr_t* inout, bool innt, bool bitrev,
-                           uint32_t lg_domain_size, uint32_t lg_blowup,
-                           stream_t& stream, bool ext_pow = false)
+                           uint32_t lg_dsz, uint32_t lg_blowup,
+                           stream_t& stream)
     {
-        size_t domain_size = (size_t)1 << lg_domain_size;
+        size_t domain_size = (size_t)1 << lg_dsz;
         const auto gen_powers =
             NTTParameters::all(innt)[stream]->partial_group_gen_powers;
 
         if (domain_size < WARP_SZ)
             LDE_distribute_powers<<<1, domain_size, 0, stream>>>
-                (inout, lg_blowup, bitrev, gen_powers, ext_pow);
-        else if (domain_size < 512)
+                (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
+        else if (lg_dsz < 32)
             LDE_distribute_powers<<<domain_size / WARP_SZ, WARP_SZ, 0, stream>>>
-                (inout, lg_blowup, bitrev, gen_powers, ext_pow);
+            (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
         else
-            LDE_distribute_powers<<<domain_size / 512, 512, 0, stream>>>
-                (inout, lg_blowup, bitrev, gen_powers, ext_pow);
+            LDE_distribute_powers<<<…>>>
+                (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
 
         CUDA_OK(cudaGetLastError());
     }
 
 protected:
-    // coset_ext_pow is only used when NTT type is coset
     static void NTT_internal(fr_t* d_inout, uint32_t lg_domain_size,
                              InputOutputOrder order, Direction direction,
-                             Type type, stream_t& stream,
-                             bool coset_ext_pow = false)
+                             Type type, stream_t& stream)
     {
         // Pick an NTT algorithm based on the input order and the desired output
         // order of the data. In certain cases, bit reversal can be avoided which
@@ -116,8 +115,7 @@ protected:
         }
 
         if (!intt && type == Type::coset)
-            LDE_powers(d_inout, intt, bitrev, lg_domain_size, 0, stream,
-                       coset_ext_pow);
+            LDE_powers(d_inout, intt, bitrev, lg_domain_size, 0, stream);
 
         switch (algorithm) {
         case Algorithm::GS:
@@ -129,8 +127,7 @@ protected:
         }
 
         if (intt && type == Type::coset)
-            LDE_powers(d_inout, intt, !bitrev, lg_domain_size, 0, stream,
-                       coset_ext_pow);
+            LDE_powers(d_inout, intt, !bitrev, lg_domain_size, 0, stream);
 
         if (order == InputOutputOrder::RR)
             bit_rev(d_inout, d_inout, lg_domain_size, stream);
@@ -139,7 +136,7 @@ public:
     static RustError Base(const gpu_t& gpu, fr_t* inout, uint32_t lg_domain_size,
                           InputOutputOrder order, Direction direction,
-                          Type type, bool coset_ext_pow = false)
+                          Type type)
     {
         if (lg_domain_size == 0)
             return RustError{cudaSuccess};
@@ -151,8 +148,7 @@ public:
             dev_ptr_t<fr_t> d_inout{domain_size, gpu};
             gpu.HtoD(&d_inout[0], inout, domain_size);
 
-            NTT_internal(&d_inout[0], lg_domain_size, order, direction, type, gpu,
-                         coset_ext_pow);
+            NTT_internal(&d_inout[0], lg_domain_size, order, direction, type, gpu);
 
             gpu.DtoH(inout, &d_inout[0], domain_size);
             gpu.sync();
@@ -168,48 +164,6 @@ public:
         return RustError{cudaSuccess};
     }
 
-    static RustError LDE(const gpu_t& gpu, fr_t* inout,
-                         uint32_t lg_domain_size, uint32_t lg_blowup,
-                         bool ext_pow = false)
-    {
-        try {
-            gpu.select();
-
-            size_t domain_size = (size_t)1 << lg_domain_size;
-            size_t ext_domain_size = domain_size << lg_blowup;
-            dev_ptr_t<fr_t> d_ext_domain{ext_domain_size, gpu};
-            fr_t* d_domain = &d_ext_domain[ext_domain_size - domain_size];
-
-            gpu.HtoD(&d_domain[0], inout, domain_size);
-
-            NTT_internal(&d_domain[0], lg_domain_size,
-                         InputOutputOrder::NR, Direction::inverse,
-                         Type::standard, gpu);
-
-            const auto gen_powers =
-                NTTParameters::all()[gpu.id()]->partial_group_gen_powers;
-
-            LDE_launch(gpu, &d_ext_domain[0], &d_domain[0],
-                       gen_powers, lg_domain_size, lg_blowup, ext_pow);
-
-            NTT_internal(&d_ext_domain[0], lg_domain_size + lg_blowup,
-                         InputOutputOrder::RN, Direction::forward,
-                         Type::standard, gpu);
-
-            gpu.DtoH(inout, &d_ext_domain[0], ext_domain_size);
-            gpu.sync();
-        } catch (const cuda_error& e) {
-            gpu.sync();
-#ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE
-            return RustError{e.code(), e.what()};
-#else
-            return RustError{e.code()};
-#endif
-        }
-
-        return RustError{cudaSuccess};
-    }
-
 protected:
     static void LDE_launch(stream_t& stream,
                            fr_t* ext_domain_data, fr_t* domain_data,
@@ -252,17 +206,19 @@ public:
     static RustError LDE_aux(const gpu_t& gpu, fr_t* inout,
                              uint32_t lg_domain_size, uint32_t lg_blowup,
-                             bool ext_pow = false)
+                             fr_t *aux_out = nullptr)
     {
         try {
             size_t domain_size = (size_t)1 << lg_domain_size;
             size_t ext_domain_size = domain_size << lg_blowup;
+            size_t aux_size = aux_out != nullptr ? domain_size : 0;
             // The 2nd to last 'domain_size' chunk will hold the original data
             // The last chunk will get the bit reversed iNTT data
-            dev_ptr_t<fr_t> d_inout{ext_domain_size + domain_size, gpu}; // + domain_size for aux buffer
+            dev_ptr_t<fr_t> d_inout{ext_domain_size + aux_size, gpu};
             fr_t* aux_data = &d_inout[ext_domain_size];
             fr_t* domain_data = &d_inout[ext_domain_size - domain_size]; // aligned to the end
             fr_t* ext_domain_data = &d_inout[0];
+
             gpu.HtoD(domain_data, inout, domain_size);
 
             NTT_internal(domain_data, lg_domain_size,
@@ -272,17 +228,26 @@ public:
             const auto gen_powers =
                 NTTParameters::all()[gpu.id()]->partial_group_gen_powers;
 
-            bit_rev(aux_data, domain_data, lg_domain_size, gpu);
+            event_t sync_event;
+
+            if (aux_out != nullptr) {
+                bit_rev(aux_data, domain_data, lg_domain_size, gpu);
+                sync_event.record(gpu);
+            }
 
             LDE_launch(gpu, ext_domain_data, domain_data, gen_powers,
-                       lg_domain_size, lg_blowup, true, ext_pow);
+                       lg_domain_size, lg_blowup);
 
             // NTT - RN
             NTT_internal(ext_domain_data, lg_domain_size + lg_blowup,
                          InputOutputOrder::RN, Direction::forward,
                          Type::standard, gpu);
 
-            gpu.DtoH(inout, ext_domain_data, domain_size << lg_blowup);
+            if (aux_out != nullptr) {
+                sync_event.wait(gpu[0]);
+                gpu[0].DtoH(aux_out, aux_data, aux_size);
+            }
+            gpu.DtoH(inout, ext_domain_data, ext_domain_size);
             gpu.sync();
         } catch (const cuda_error& e) {
             gpu.sync();
@@ -296,22 +261,24 @@ public:
         return RustError{cudaSuccess};
     }
 
-    // coset_ext_pow is only used when NTT type is coset
+    static RustError LDE(const gpu_t& gpu, fr_t* inout,
+                         uint32_t lg_domain_size, uint32_t lg_blowup)
+    {   return LDE_aux(gpu, inout, lg_domain_size, lg_blowup);   }
+
     static void Base_dev_ptr(stream_t& stream, fr_t* d_inout,
                              uint32_t lg_domain_size, InputOutputOrder order,
-                             Direction direction, Type type,
-                             bool coset_ext_pow = false)
+                             Direction direction, Type type)
     {
         size_t domain_size = (size_t)1 << lg_domain_size;
 
         NTT_internal(&d_inout[0], lg_domain_size, order, direction, type,
-                     stream, coset_ext_pow);
+                     stream);
     }
 
     static void LDE_powers(stream_t& stream, fr_t* d_inout,
-                           uint32_t lg_domain_size, bool ext_pow = false)
+                           uint32_t lg_domain_size)
     {
-        LDE_powers(d_inout, false, true, lg_domain_size, 0, stream, ext_pow);
+        LDE_powers(d_inout, false, true, lg_domain_size, 0, stream);
     }
 
     // If d_out and d_in overlap, d_out is expected to encompass d_in and
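A note on the LDE_aux() rework above: the bit-reversed iNTT copy is now produced only when the caller asks for it, and the event_t record/wait pair lets the auxiliary device-to-host copy on the secondary stream gpu[0] overlap the forward NTT still queued on the main stream. Stripped of sppark's wrappers, the pattern is plain CUDA runtime API; a generic sketch with hypothetical kernel and buffer names, not sppark's:

    #include <cuda_runtime.h>

    __global__ void produce_aux(float* aux)  { aux[threadIdx.x] += 1.0f; }
    __global__ void heavy_work(float* data)  { data[threadIdx.x] *= 2.0f; }

    void overlap_copy(float* d_main, float* d_aux, float* h_aux, unsigned n,
                      cudaStream_t main_stream, cudaStream_t copy_stream)
    {
        cudaEvent_t ev;
        cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);

        produce_aux<<<1, n, 0, main_stream>>>(d_aux);
        cudaEventRecord(ev, main_stream);           // aux contents are final here

        heavy_work<<<1, n, 0, main_stream>>>(d_main);   // keeps main_stream busy

        cudaStreamWaitEvent(copy_stream, ev, 0);    // copy must not start earlier
        cudaMemcpyAsync(h_aux, d_aux, n * sizeof(float),
                        cudaMemcpyDeviceToHost, copy_stream);

        cudaEventDestroy(ev);   // safe: destruction is deferred until completion
    }

The copy is correctly ordered after the producer kernel yet runs concurrently with heavy_work, which is exactly what the sync_event.record(gpu) / sync_event.wait(gpu[0]) pair achieves in LDE_aux().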
diff --git a/poc/msm-cuda/Cargo.toml b/poc/msm-cuda/Cargo.toml
index 6b606b7..226e580 100644
--- a/poc/msm-cuda/Cargo.toml
+++ b/poc/msm-cuda/Cargo.toml
@@ -2,6 +2,7 @@
 name = "msm-cuda"
 version = "0.1.0"
 edition = "2021"
+publish = false
 
 [features]
 # By default, compile with ADX extension if the host supports it.
@@ -37,7 +38,6 @@ which = "^4.0"
 
 [dev-dependencies]
 criterion = { version = "0.3", features = [ "html_reports" ] }
-rayon = "1.5"
 
 [[bench]]
 name = "msm"
diff --git a/poc/ntt-cuda/Cargo.toml b/poc/ntt-cuda/Cargo.toml
index fdd1822..3498b5d 100644
--- a/poc/ntt-cuda/Cargo.toml
+++ b/poc/ntt-cuda/Cargo.toml
@@ -32,8 +32,6 @@ sppark = { path = "../../rust" }
 cc = "^1.0.70"
 
 [dev-dependencies]
-criterion = { version = "0.3", features = [ "html_reports" ] }
-rayon = "1.5"
 rand = "^0"
 ark-std = "0.3.0"
 ark-ff = "0.3.0"
diff --git a/poc/ntt-cuda/cuda/ntt_api.cu b/poc/ntt-cuda/cuda/ntt_api.cu
index 3ca32fe..38f4d78 100644
--- a/poc/ntt-cuda/cuda/ntt_api.cu
+++ b/poc/ntt-cuda/cuda/ntt_api.cu
@@ -27,10 +27,11 @@
 #ifndef __CUDA_ARCH__
 
 extern "C"
-RustError compute_ntt(size_t device_id, fr_t* inout, uint32_t lg_domain_size,
-                      NTT::InputOutputOrder ntt_order,
-                      NTT::Direction ntt_direction,
-                      NTT::Type ntt_type)
+RustError::by_value compute_ntt(size_t device_id,
+                                fr_t* inout, uint32_t lg_domain_size,
+                                NTT::InputOutputOrder ntt_order,
+                                NTT::Direction ntt_direction,
+                                NTT::Type ntt_type)
 {
     auto& gpu = select_gpu(device_id);
 
diff --git a/poc/ntt-cuda/src/lib.rs b/poc/ntt-cuda/src/lib.rs
index 6847b4e..92162a1 100644
--- a/poc/ntt-cuda/src/lib.rs
+++ b/poc/ntt-cuda/src/lib.rs
@@ -2,27 +2,7 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-sppark::cuda_error!();
-
-#[repr(C)]
-pub enum NTTInputOutputOrder {
-    NN = 0,
-    NR = 1,
-    RN = 2,
-    RR = 3,
-}
-
-#[repr(C)]
-enum NTTDirection {
-    Forward = 0,
-    Inverse = 1,
-}
-
-#[repr(C)]
-enum NTTType {
-    Standard = 0,
-    Coset = 1,
-}
+use sppark::{NTTInputOutputOrder, NTTDirection, NTTType};
 
 extern "C" {
     fn compute_ntt(
@@ -32,7 +12,7 @@ extern "C" {
         ntt_order: NTTInputOutputOrder,
         ntt_direction: NTTDirection,
         ntt_type: NTTType,
-    ) -> cuda::Error;
+    ) -> sppark::Error;
 }
 
 /// Compute an in-place NTT on the input data.
diff --git a/poc/ntt-cuda/tests/ntt.rs b/poc/ntt-cuda/tests/ntt.rs
index 9f69a26..bb8c1ba 100644
--- a/poc/ntt-cuda/tests/ntt.rs
+++ b/poc/ntt-cuda/tests/ntt.rs
@@ -2,7 +2,7 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-use ntt_cuda::NTTInputOutputOrder;
+use sppark::NTTInputOutputOrder;
 
 const DEFAULT_GPU: usize = 0;
 
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index a83f5ad..f047ff0 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -93,3 +93,23 @@ impl<T> Clone for Gpu_Ptr<T> {
         unsafe { transmute::<_, _>(clone_gpu_ptr_t(transmute::<&_, &_>(self))) }
     }
 }
+
+#[repr(C)]
+pub enum NTTInputOutputOrder {
+    NN = 0,
+    NR = 1,
+    RN = 2,
+    RR = 3,
+}
+
+#[repr(C)]
+pub enum NTTDirection {
+    Forward = 0,
+    Inverse = 1,
+}
+
+#[repr(C)]
+pub enum NTTType {
+    Standard = 0,
+    Coset = 1,
+}
diff --git a/util/gpu_t.cuh b/util/gpu_t.cuh
index e9e5eda..008a1f6 100644
--- a/util/gpu_t.cuh
+++ b/util/gpu_t.cuh
@@ -299,7 +299,7 @@ public:
     inline operator T*() const  { return ptr->ptr; }
 
     // facilitate return by value through FFI, as gpu_ptr_t<T>::by_value.
-    struct by_value { inner *ptr; };
+    using by_value = struct { inner *ptr; };
     operator by_value() const
     {   ptr->ref_cnt.fetch_add(1, std::memory_order_relaxed); return {ptr};   }
     gpu_ptr_t(by_value v)   { ptr = v.ptr; }
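The by_value change above (and the matching one in util/rusterror.h below) swaps a named nested struct for an alias to an unnamed one; either way, the point of the idiom is that extern "C" entry points return a trivial aggregate, which the C ABI and Rust's #[repr(C)] agree on, while the C++ side converts from the richer reference-counted type on the way out. A minimal sketch of the idiom with hypothetical names, not sppark's API:

    #include <cstdio>

    struct handle_t {
        int* ptr;
        handle_t(int* p) : ptr(p) {}              // non-trivial C++ surface...

        using by_value = struct { int* ptr; };    // ...but a trivial mirror for FFI
        operator by_value() const { return {ptr}; }
    };

    // Returns the POD mirror by value; a Rust caller would declare a
    // matching #[repr(C)] struct as the return type.
    extern "C" handle_t::by_value make_handle()
    {
        static int storage = 42;
        return handle_t(&storage);                // converted to the POD on return
    }

    int main()
    {
        printf("%d\n", *make_handle().ptr);       // prints 42
        return 0;
    }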
diff --git a/util/rusterror.h b/util/rusterror.h
index 9e8996e..02c0aff 100644
--- a/util/rusterror.h
+++ b/util/rusterror.h
@@ -24,7 +24,7 @@ struct RustError { /* to be returned exclusively by value */
     {   message = str==nullptr ? nullptr : strdup(str);   }
     // no destructor[!], Rust takes care of the |message|
 
-    struct by_value {
+    using by_value = struct {
         int code;
         char *message;
     };
diff --git a/util/thread_pool_t.hpp b/util/thread_pool_t.hpp
index 5ab7385..ff394a6 100644
--- a/util/thread_pool_t.hpp
+++ b/util/thread_pool_t.hpp
@@ -127,6 +127,8 @@ class thread_pool_t {
             return;
         }
+#else
+        (void)affinity_env;
 #endif
         init(0);
     }
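Finally, the NN/NR/RN/RR variants of NTTInputOutputOrder, now exported from the sppark crate, describe whether the transform's input and output are in natural (N) or bit-reversed (R) index order; an "R" side means element i sits at position bit_rev(i, lg_domain_size). A straightforward host-side counterpart of the device bit_rev() helper (illustrative; the kernels use a hardware bit-reverse instruction instead):

    #include <cassert>

    template<class T>
    T bit_rev(T i, unsigned int nbits)
    {
        T r = 0;
        for (unsigned int b = 0; b < nbits; b++, i >>= 1)
            r = (r << 1) | (i & 1);     // move the lowest bit to the top
        return r;
    }

    int main()
    {
        const unsigned lg = 3;          // domain of 8 elements
        // natural order 0..7 maps to 0,4,2,6,1,5,3,7
        unsigned expect[8] = {0, 4, 2, 6, 1, 5, 3, 7};
        for (unsigned i = 0; i < 8; i++)
            assert(bit_rev(i, lg) == expect[i]);
        // bit_rev is an involution, so an R-ordered output fed back as
        // R-ordered input round-trips to natural order
        for (unsigned i = 0; i < 8; i++)
            assert(bit_rev(bit_rev(i, lg), lg) == i);
        return 0;
    }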