diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..cea979d
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,108 @@
+name: test-build
+
+on:
+  push:
+    branches:
+      - '**'
+  workflow_dispatch:
+    branches:
+      - '**'
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  cargo-test-no-run:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Get date
+        id: get-date
+        run: echo "date=$(date -u +%Y-%m)" >> $GITHUB_OUTPUT
+        shell: bash
+
+      - uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry
+            **/Cargo.lock
+            **/target
+          key: ${{ runner.os }}-cargo-${{ steps.get-date.outputs.date }}
+
+      - name: Environment
+        shell: bash
+        run: |
+          lscpu 2>/dev/null && echo --- || true
+          env | sort
+
+      - name: Install cuda-minimal-build-12-3
+        shell: bash
+        run: |
+          # https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get -y install cuda-minimal-build-12-3
+          [ -d /usr/local/cuda-12.3/bin ]
+
+      - name: Test-build poc/ntt-cuda
+        shell: bash
+        run: |
+          rustc --version --verbose
+          export PATH=$PATH:/usr/local/cuda-12.3/bin
+          ( cd poc/ntt-cuda
+            cargo update
+            cargo test --no-run --release --features=bls12_381
+            cargo test --no-run --release --features=gl64
+            cargo test --no-run --release --features=bb31
+            if which clang++ 2>/dev/null; then
+                echo
+                echo Testing with clang++
+                echo
+                clang++ --version
+                echo
+                export CXX=clang++
+                cargo test --no-run --release --features=bls12_381
+                cargo test --no-run --release --features=gl64
+                cargo test --no-run --release --features=bb31
+            fi
+            cargo clean -p ntt-cuda
+            cargo clean -p ntt-cuda --release
+            rm -rf target/.rustc_info.json
+            rm -rf target/package
+            rm -rf target/{debug,release}/incremental
+            rm -rf target/*/{debug,release}/incremental
+          )
+
+      - name: Test-build poc/msm-cuda
+        shell: bash
+        run: |
+          rustc --version --verbose
+          export PATH=$PATH:/usr/local/cuda-12.3/bin
+          ( cd poc/msm-cuda
+            sed "s/^crit/#crit/" Cargo.toml > Cargo.$$.toml && \
+            mv Cargo.$$.toml Cargo.toml
+            cargo update
+            cargo test --no-run --release --features=bls12_381,quiet
+            cargo test --no-run --release --features=bn254,quiet
+            if which clang++ 2>/dev/null; then
+                echo
+                echo Testing with clang++
+                echo
+                clang++ --version
+                echo
+                export CXX=clang++
+                cargo test --no-run --release --features=bls12_381,quiet
+                cargo test --no-run --release --features=bn254,quiet
+            fi
+            cargo clean -p msm-cuda
+            cargo clean -p msm-cuda --release
+            rm -rf target/.rustc_info.json
+            rm -rf target/package
+            rm -rf target/{debug,release}/incremental
+            rm -rf target/*/{debug,release}/incremental
+          )
+          rm -rf ~/.cargo/registry/src
+          rm -rf ~/.cargo/registry/index/*/.cache
diff --git a/ff/baby_bear.hpp b/ff/baby_bear.hpp
index cce438a..c7439f0 100644
--- a/ff/baby_bear.hpp
+++ b/ff/baby_bear.hpp
@@ -27,18 +27,18 @@ class bb31_t {
     // this is used in constant declaration, e.g. as bb31_t{11}
     inline constexpr bb31_t(int a) : val(((uint64_t)a << 32) % MOD) {}
 
-    static inline const bb31_t one()    { return bb31_t(1); }
-    inline bb31_t& operator+=(bb31_t b) { return *this; }
-    inline bb31_t& operator-=(bb31_t b) { return *this; }
-    inline bb31_t& operator*=(bb31_t b) { return *this; }
-    inline bb31_t& operator^=(int b)    { return *this; }
-    inline bb31_t& sqr()                { return *this; }
-    friend bb31_t operator+(bb31_t a, bb31_t b) { return a += b; }
-    friend bb31_t operator-(bb31_t a, bb31_t b) { return a -= b; }
-    friend bb31_t operator*(bb31_t a, bb31_t b) { return a *= b; }
-    friend bb31_t operator^(bb31_t a, int b)    { return a ^= b; }
-    inline void zero()                  { val = 0; }
-    inline bool is_zero() const         { return val==0; }
+    static inline const bb31_t one()    { return bb31_t(1); }
+    inline bb31_t& operator+=(bb31_t b) { return *this; }
+    inline bb31_t& operator-=(bb31_t b) { return *this; }
+    inline bb31_t& operator*=(bb31_t b) { return *this; }
+    inline bb31_t& operator^=(int b)    { return *this; }
+    inline bb31_t& sqr()                { return *this; }
+    friend bb31_t operator+(bb31_t a, bb31_t b) { return a += b; }
+    friend bb31_t operator-(bb31_t a, bb31_t b) { return a -= b; }
+    friend bb31_t operator*(bb31_t a, bb31_t b) { return a *= b; }
+    friend bb31_t operator^(bb31_t a, uint32_t b) { return a ^= b; }
+    inline void zero()                  { val = 0; }
+    inline bool is_zero() const         { return val==0; }
 
     inline operator uint32_t() const
     {   return ((val*M)*(uint64_t)MOD + val) >> 32;   }
    inline void to()    { val = ((uint64_t)val<<32) % MOD; }
diff --git a/ff/bb31_t.cuh b/ff/bb31_t.cuh
index 9e65c97..fb2a04f 100644
--- a/ff/bb31_t.cuh
+++ b/ff/bb31_t.cuh
@@ -104,10 +104,10 @@ public:
         return *this;
     }
 
-    friend inline bb31_t cneg(bb31_t a, bool flag)
+    static inline bb31_t cneg(bb31_t a, bool flag)
     {   return a.cneg(flag);   }
     inline bb31_t operator-() const
-    {   bb31_t ret = *this; return ret.cneg(true);   }
+    {   return cneg(*this, true);   }
 
     static inline const bb31_t one()    { return bb31_t{ONE}; }
     inline bool is_one() const          { return val == ONE; }
@@ -353,6 +353,9 @@ public:
     {   return a * b.reciprocal();   }
     inline bb31_t& operator/=(const bb31_t a)
     {   return *this *= a.reciprocal();   }
+
+    inline void shfl_bfly(uint32_t laneMask)
+    {   val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask);   }
 };
 
 # undef inline
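Note on the change above: bb31_t::shfl_bfly() turns the warp-level butterfly exchange into a method on the field type (gl64_t and mont_t below get the same treatment), replacing the free-standing shfl_bfly() helpers that a later hunk removes from ntt/kernels.cu. The primitive underneath is __shfl_xor_sync(): each lane swaps its register value with the lane whose ID differs in the bits of laneMask. A self-contained CUDA sketch of that behavior, with illustrative names that are not part of the patch:

    #include <cstdio>
    #include <cstdint>

    __global__ void bfly_demo(uint32_t* out, uint32_t laneMask)
    {
        uint32_t lane = threadIdx.x & 31;
        uint32_t val = lane;                  // each lane starts with its own ID
        // the butterfly exchange: lane i ends up with the value from lane i^laneMask
        val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask);
        out[threadIdx.x] = val;               // out[i] == i ^ laneMask
    }

    int main()
    {
        uint32_t* d_out;
        cudaMalloc(&d_out, 32 * sizeof(uint32_t));
        bfly_demo<<<1, 32>>>(d_out, 4);
        uint32_t h_out[32];
        cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
        for (int i = 0; i < 32; i++)
            printf("%2d -> %2u\n", i, h_out[i]);   // prints i ^ 4
        cudaFree(d_out);
        return 0;
    }

In the NTT kernels this pairs threads whose data indices differ by the current butterfly distance, so the exchange needs no shared memory round-trip.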
: "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2]) + : "r"(gl64_device::W)); # else uint32_t b0, b1; - asm("add.cc.u32 %0, %2, %3; addc.u32 %1, 0, 0;" + + asm("sub.cc.u32 %0, 0, %2; subc.u32 %1, %2, 0;" : "=r"(b0), "=r"(b1) - : "r"(temp[2]), "r"(temp[3])); - asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, %4; subc.u32 %2, 0, 0;" - : "+r"(temp[0]), "+r"(temp[1]), "=r"(carry) + : "r"(temp[2])); + asm("add.cc.u32 %0, %0, %3; addc.cc.u32 %1, %1, %4; addc.u32 %2, 0, 0;" + : "+r"(temp[0]), "+r"(temp[1]), "=r"(temp[2]) : "r"(b0), "r"(b1)); - asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, %3;" - : "+r"(temp[0]), "+r"(temp[1]) - : "r"(-carry), "r"(carry)); - asm("add.cc.u32 %0, %0, %1; addc.u32 %1, 0, 0;" - : "+r"(temp[1]), "+r"(temp[2])); +# endif + asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, 0; subc.u32 %2, %2, 0;" + : "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2]) + : "r"(temp[3])); -# if __CUDA_ARCH__ >= 700 - asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;" + asm("sub.cc.u32 %0, %0, %2; subc.u32 %1, %1, %3;" : "+r"(temp[0]), "+r"(temp[1]) - : "r"(temp[2]), "r"(gl64_device::W)); -# else - asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, 0;" - : "+r"(temp[0]), "+r"(temp[1]) - : "r"(-temp[2])); -# endif -# endif + : "r"(temp[2]), "r"(-(int)temp[2]>>1)); + asm("mov.b64 %0, {%1, %2};" : "=l"(val) : "r"(temp[0]), "r"(temp[1])); } @@ -608,6 +591,9 @@ public: return t1; } + + inline void shfl_bfly(uint32_t laneMask) + { val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask); } }; # undef inline diff --git a/ff/mont_t.cuh b/ff/mont_t.cuh index cc5c8ee..9ba34d6 100644 --- a/ff/mont_t.cuh +++ b/ff/mont_t.cuh @@ -311,10 +311,10 @@ public: asm("}"); return *this; } - friend inline mont_t cneg(mont_t a, bool flag) + static inline mont_t cneg(mont_t a, bool flag) { return a.cneg(flag); } #else - friend inline mont_t cneg(const mont_t& a, bool flag) + static inline mont_t cneg(const mont_t& a, bool flag) { size_t i; uint32_t tmp[n], is_zero = a[0]; @@ -335,6 +335,8 @@ public: asm("}"); return ret; } + inline mont_t& cneg(bool flag) + { return *this = cneg(*this, flag); } #endif inline mont_t operator-() const { return cneg(*this, true); } @@ -1109,6 +1111,13 @@ public: { return a * b.reciprocal(); } inline mont_t& operator/=(const mont_t& a) { return *this *= a.reciprocal(); } + + inline void shfl_bfly(uint32_t laneMask) + { + #pragma unroll + for (size_t i=0; i -#ifdef __CUDA_ARCH__ -__device__ __forceinline__ -void shfl_bfly(fr_t& r, int laneMask) -{ - #pragma unroll - for (int iter = 0; iter < r.len(); iter++) - r[iter] = __shfl_xor_sync(0xFFFFFFFF, r[iter], laneMask); -} -#endif - -__device__ __forceinline__ -void shfl_bfly(index_t& index, int laneMask) -{ - index = __shfl_xor_sync(0xFFFFFFFF, index, laneMask); -} - -template -__device__ __forceinline__ -void swap(T& u1, T& u2) -{ - T temp = u1; - u1 = u2; - u2 = temp; -} - template __device__ __forceinline__ T bit_rev(T i, unsigned int nbits) @@ -142,7 +117,31 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE], { unsigned int off = 0; - fr_t t, root = roots[off][pow % WINDOW_SIZE]; + fr_t t, root; + + if (sizeof(fr_t) <= 8) { + root = fr_t::one(); + bool root_set = false; + + #pragma unroll + for (unsigned int pow_win, i = 0; i < WINDOW_NUM; i++) { + if (!root_set && (pow_win = pow % WINDOW_SIZE)) { + root = roots[i][pow_win]; + root_set = true; + } + if (!root_set) { + pow >>= LG_WINDOW_SIZE; + off++; + } + } + } else { + if ((pow % WINDOW_SIZE) == 0) { + pow >>= LG_WINDOW_SIZE; + off++; + } + root = roots[off][pow % 
diff --git a/ff/mont_t.cuh b/ff/mont_t.cuh
index cc5c8ee..9ba34d6 100644
--- a/ff/mont_t.cuh
+++ b/ff/mont_t.cuh
@@ -311,10 +311,10 @@ public:
         asm("}");
         return *this;
     }
-    friend inline mont_t cneg(mont_t a, bool flag)
+    static inline mont_t cneg(mont_t a, bool flag)
     {   return a.cneg(flag);   }
 #else
-    friend inline mont_t cneg(const mont_t& a, bool flag)
+    static inline mont_t cneg(const mont_t& a, bool flag)
     {
         size_t i;
         uint32_t tmp[n], is_zero = a[0];
@@ -335,6 +335,8 @@ public:
         asm("}");
         return ret;
     }
+    inline mont_t& cneg(bool flag)
+    {   return *this = cneg(*this, flag);   }
 #endif
     inline mont_t operator-() const
     {   return cneg(*this, true);   }
@@ -1109,6 +1111,13 @@ public:
     {   return a * b.reciprocal();   }
     inline mont_t& operator/=(const mont_t& a)
     {   return *this *= a.reciprocal();   }
+
+    inline void shfl_bfly(uint32_t laneMask)
+    {
+        #pragma unroll
+        for (size_t i=0; i<n; i++)
+            even[i] = __shfl_xor_sync(0xFFFFFFFF, even[i], laneMask);
+    }
 };
 
 # undef inline
diff --git a/ntt/kernels.cu b/ntt/kernels.cu
--- a/ntt/kernels.cu
+++ b/ntt/kernels.cu
@@ … @@
 #include <cassert>
 
-#ifdef __CUDA_ARCH__
-__device__ __forceinline__
-void shfl_bfly(fr_t& r, int laneMask)
-{
-    #pragma unroll
-    for (int iter = 0; iter < r.len(); iter++)
-        r[iter] = __shfl_xor_sync(0xFFFFFFFF, r[iter], laneMask);
-}
-#endif
-
-__device__ __forceinline__
-void shfl_bfly(index_t& index, int laneMask)
-{
-    index = __shfl_xor_sync(0xFFFFFFFF, index, laneMask);
-}
-
-template<class T>
-__device__ __forceinline__
-void swap(T& u1, T& u2)
-{
-    T temp = u1;
-    u1 = u2;
-    u2 = temp;
-}
-
 template<class T>
 __device__ __forceinline__
 T bit_rev(T i, unsigned int nbits)
@@ -142,7 +117,31 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE],
 {
     unsigned int off = 0;
 
-    fr_t t, root = roots[off][pow % WINDOW_SIZE];
+    fr_t t, root;
+
+    if (sizeof(fr_t) <= 8) {
+        root = fr_t::one();
+        bool root_set = false;
+
+        #pragma unroll
+        for (unsigned int pow_win, i = 0; i < WINDOW_NUM; i++) {
+            if (!root_set && (pow_win = pow % WINDOW_SIZE)) {
+                root = roots[i][pow_win];
+                root_set = true;
+            }
+            if (!root_set) {
+                pow >>= LG_WINDOW_SIZE;
+                off++;
+            }
+        }
+    } else {
+        if ((pow % WINDOW_SIZE) == 0) {
+            pow >>= LG_WINDOW_SIZE;
+            off++;
+        }
+        root = roots[off][pow % WINDOW_SIZE];
+    }
+
     #pragma unroll 1
     while (pow >>= LG_WINDOW_SIZE)
         root *= (t = roots[++off][pow % WINDOW_SIZE]);
@@ -151,28 +150,26 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE],
 }
 
 __launch_bounds__(1024) __global__
-void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_blowup, bool bitrev,
-                           const fr_t (*gen_powers)[WINDOW_SIZE],
-                           bool ext_pow = false)
+void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_domain_size,
+                           uint32_t lg_blowup, bool bitrev,
+                           const fr_t (*gen_powers)[WINDOW_SIZE])
 {
-    index_t idx = threadIdx.x + blockDim.x * (index_t)blockIdx.x;
-    index_t pow = idx;
-    fr_t r = d_inout[idx];
-
-    if (bitrev) {
-        size_t domain_size = gridDim.x * (size_t)blockDim.x;
-        assert((domain_size & (domain_size-1)) == 0);
-        uint32_t lg_domain_size = 63 - __clzll(domain_size);
+#if 0
+    assert(blockDim.x * gridDim.x == blockDim.x * (size_t)gridDim.x);
+#endif
+    size_t domain_size = (size_t)1 << lg_domain_size;
+    index_t idx = threadIdx.x + blockDim.x * blockIdx.x;
 
-        pow = bit_rev(idx, lg_domain_size);
-    }
+    #pragma unroll 1
+    for (; idx < domain_size; idx += blockDim.x * gridDim.x) {
+        fr_t r = d_inout[idx];
 
-    if (ext_pow)
+        index_t pow = bitrev ? bit_rev(idx, lg_domain_size) : idx;
         pow <<= lg_blowup;
+        r *= get_intermediate_root(pow, gen_powers);
 
-    r = r * get_intermediate_root(pow, gen_powers);
-
-    d_inout[idx] = r;
+        d_inout[idx] = r;
+    }
 }
 
 __launch_bounds__(1024) __global__
@@ -252,13 +249,16 @@ void get_intermediate_roots(fr_t& root0, fr_t& root1,
 {
     int win = (WINDOW_NUM - 1) * LG_WINDOW_SIZE;
     int off = (WINDOW_NUM - 1);
+    index_t idxo = idx0 | idx1;
+    index_t mask = ((index_t)1 << win) - 1;
 
     root0 = roots[off][idx0 >> win];
     root1 = roots[off][idx1 >> win];
     #pragma unroll 1
-    while (off--) {
+    while (off-- && (idxo & mask)) {
         fr_t t;
         win -= LG_WINDOW_SIZE;
+        mask >>= LG_WINDOW_SIZE;
         root0 *= (t = roots[off][(idx0 >> win) % WINDOW_SIZE]);
         root1 *= (t = roots[off][(idx1 >> win) % WINDOW_SIZE]);
     }
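For context on the get_intermediate_root*() hunks above: a power g^pow is assembled as a product of per-window table entries, table[i][w] = g^(w << (i*LG_WINDOW_SIZE)). A zero window digit contributes roots[i][0] = 1, so both the new sizeof(fr_t) <= 8 path and the (idxo & mask) early exit merely skip multiplications by one. A self-contained toy, with made-up window parameters rather than sppark's, demonstrating why skipping zero windows is sound:

    #include <cassert>
    #include <cstdint>

    // Toy parameters, not sppark's: Goldilocks field, 4 windows of 8 bits.
    const uint64_t P = 0xFFFFFFFF00000001ULL;
    const unsigned LG_WINDOW = 8, WINDOWS = 4;

    uint64_t mul_mod(uint64_t a, uint64_t b)
    {   return (uint64_t)((unsigned __int128)a * b % P);   }

    uint64_t pow_mod(uint64_t g, uint64_t e)
    {
        uint64_t r = 1;
        for (; e; e >>= 1, g = mul_mod(g, g))
            if (e & 1) r = mul_mod(r, g);
        return r;
    }

    int main()
    {
        uint64_t g = 7;                         // arbitrary base
        static uint64_t table[WINDOWS][1u << LG_WINDOW];
        for (unsigned i = 0; i < WINDOWS; i++)
            for (unsigned w = 0; w < (1u << LG_WINDOW); w++)
                table[i][w] = pow_mod(g, (uint64_t)w << (i * LG_WINDOW));

        uint64_t pow = 0x00ab00cd;              // windows 1 and 3 are zero
        uint64_t root = 1, e = pow;
        for (unsigned i = 0; i < WINDOWS; i++, e >>= LG_WINDOW) {
            unsigned w = e & ((1u << LG_WINDOW) - 1);
            if (w != 0)                         // skipping w == 0 skips a multiply by 1
                root = mul_mod(root, table[i][w]);
        }
        assert(root == pow_mod(g, pow));
        return 0;
    }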
diff --git a/ntt/kernels/ct_mixed_radix_narrow.cu b/ntt/kernels/ct_mixed_radix_narrow.cu
index d90e70b..1f008df 100644
--- a/ntt/kernels/ct_mixed_radix_narrow.cu
+++ b/ntt/kernels/ct_mixed_radix_narrow.cu
@@ -100,7 +100,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         for (int z = 0; z < z_count; z++) {
             fr_t t = fr_t::csel(r[1][z], r[0][z], pos);
 
-            shfl_bfly(t, laneMask);
+            t.shfl_bfly(laneMask);
 
             r[0][z] = fr_t::csel(t, r[0][z], !pos);
             r[1][z] = fr_t::csel(t, r[1][z], pos);
@@ -156,7 +156,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << (stage + iterations)) - ((index_t)1 << stage);
+    index_t mask = (index_t)((1 << iterations) - 1) << stage;
     index_t rotw = idx0 & mask;
     rotw = (rotw >> 1) | (rotw << (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
diff --git a/ntt/kernels/ct_mixed_radix_wide.cu b/ntt/kernels/ct_mixed_radix_wide.cu
index 11366ad..eec4a61 100644
--- a/ntt/kernels/ct_mixed_radix_wide.cu
+++ b/ntt/kernels/ct_mixed_radix_wide.cu
@@ -43,12 +43,11 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         unsigned int nbits = MAX_LG_DOMAIN_SIZE - stage;
         index_t root_idx0 = bit_rev(thread_ntt_idx, nbits) * thread_ntt_pos;
-        index_t root_idx1 = thread_ntt_pos << (nbits - 1);
+        index_t root_idx1 = root_idx0 + (thread_ntt_pos << (nbits - 1));
 
         fr_t first_root, second_root;
         get_intermediate_roots(first_root, second_root,
                                root_idx0, root_idx1, d_partial_twiddles);
-        second_root *= first_root;
 
         r0 *= first_root;
         r1 *= second_root;
@@ -81,7 +80,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
 #ifdef __CUDA_ARCH__
         fr_t x = fr_t::csel(r1, r0, pos);
-        shfl_bfly(x, laneMask);
+        x.shfl_bfly(laneMask);
         r0 = fr_t::csel(x, r0, !pos);
         r1 = fr_t::csel(x, r1, pos);
 #endif
@@ -126,7 +125,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << (stage + iterations)) - ((index_t)1 << stage);
+    index_t mask = (index_t)((1 << iterations) - 1) << stage;
     index_t rotw = idx0 & mask;
     rotw = (rotw >> 1) | (rotw << (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
diff --git a/ntt/kernels/gs_mixed_radix_narrow.cu b/ntt/kernels/gs_mixed_radix_narrow.cu
index 3f0946c..a9f5f0a 100644
--- a/ntt/kernels/gs_mixed_radix_narrow.cu
+++ b/ntt/kernels/gs_mixed_radix_narrow.cu
@@ -104,7 +104,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
 #ifdef __CUDA_ARCH__
             t = fr_t::csel(r[1][z], r[0][z], pos);
 
-            shfl_bfly(t, laneMask);
+            t.shfl_bfly(laneMask);
 
             r[0][z] = fr_t::csel(t, r[0][z], !pos);
             r[1][z] = fr_t::csel(t, r[1][z], pos);
@@ -159,7 +159,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << stage) - ((index_t)1 << (stage - iterations));
+    index_t mask = (index_t)((1 << iterations) - 1) << (stage - iterations);
     index_t rotw = idx0 & mask;
     rotw = (rotw << 1) | (rotw >> (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
diff --git a/ntt/kernels/gs_mixed_radix_wide.cu b/ntt/kernels/gs_mixed_radix_wide.cu
index f5cd4bb..cd64da8 100644
--- a/ntt/kernels/gs_mixed_radix_wide.cu
+++ b/ntt/kernels/gs_mixed_radix_wide.cu
@@ -70,7 +70,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         bool pos = rank < laneMask;
 #ifdef __CUDA_ARCH__
         t = fr_t::csel(r1, r0, pos);
-        shfl_bfly(t, laneMask);
+        t.shfl_bfly(laneMask);
         r0 = fr_t::csel(t, r0, !pos);
         r1 = fr_t::csel(t, r1, pos);
 #endif
@@ -89,12 +89,11 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
         unsigned int nbits = MAX_LG_DOMAIN_SIZE - (stage - iterations);
         index_t root_idx0 = bit_rev(thread_ntt_idx, nbits) * thread_ntt_pos;
-        index_t root_idx1 = thread_ntt_pos << (nbits - 1);
+        index_t root_idx1 = root_idx0 + (thread_ntt_pos << (nbits - 1));
 
         fr_t first_root, second_root;
         get_intermediate_roots(first_root, second_root,
                                root_idx0, root_idx1, d_partial_twiddles);
-        second_root *= first_root;
 
         r0 *= first_root;
         r1 *= second_root;
@@ -120,7 +119,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size,
     }
 
     // rotate "iterations" bits in indices
-    index_t mask = ((index_t)1 << stage) - ((index_t)1 << (stage - iterations));
+    index_t mask = (index_t)((1 << iterations) - 1) << (stage - iterations);
     index_t rotw = idx0 & mask;
     rotw = (rotw << 1) | (rotw >> (iterations - 1));
     idx0 = (idx0 & ~mask) | (rotw & mask);
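Two recurring changes in the four kernel files above deserve a note. First, since w^a * w^b = w^(a+b) for a root of unity w, folding the offset into the second index (root_idx1 = root_idx0 + ...) lets get_intermediate_roots() return the final second root directly, which is why the follow-up second_root *= first_root multiplication disappears. Second, the rewritten rotation masks are exact equivalents: both expressions denote a run of `iterations` one-bits starting at the given bit position, i.e. 2^shift * (2^iterations - 1). A quick host-side equivalence check, not from the patch, valid for the small radix values these kernels use (iterations stays well below 31, so the 32-bit (1 << iterations) never overflows):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        typedef uint64_t index_t;
        for (unsigned stage = 0; stage < 32; stage++)
            for (unsigned iterations = 1; iterations <= 10; iterations++) {
                index_t old_mask = ((index_t)1 << (stage + iterations))
                                 - ((index_t)1 << stage);
                index_t new_mask = (index_t)((1 << iterations) - 1) << stage;
                assert(old_mask == new_mask);   // same run of one-bits
            }
        return 0;
    }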
diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh
index 734a9a5..d783927 100644
--- a/ntt/ntt.cuh
+++ b/ntt/ntt.cuh
@@ -57,32 +57,31 @@ protected:
 private:
     static void LDE_powers(fr_t* inout, bool innt, bool bitrev,
-                           uint32_t lg_domain_size, uint32_t lg_blowup,
-                           stream_t& stream, bool ext_pow = false)
+                           uint32_t lg_dsz, uint32_t lg_blowup,
+                           stream_t& stream)
     {
-        size_t domain_size = (size_t)1 << lg_domain_size;
+        size_t domain_size = (size_t)1 << lg_dsz;
         const auto gen_powers =
             NTTParameters::all(innt)[stream]->partial_group_gen_powers;
 
         if (domain_size < WARP_SZ)
             LDE_distribute_powers<<<1, domain_size, 0, stream>>>
-                (inout, lg_blowup, bitrev, gen_powers, ext_pow);
-        else if (domain_size < 512)
+                (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
+        else if (lg_dsz < 32)
             LDE_distribute_powers<<<domain_size / WARP_SZ, WARP_SZ, 0, stream>>>
-                (inout, lg_blowup, bitrev, gen_powers, ext_pow);
+            (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
         else
-            LDE_distribute_powers<<<domain_size / 512, 512, 0, stream>>>
-                (inout, lg_blowup, bitrev, gen_powers, ext_pow);
+            LDE_distribute_powers<<<…>>>
+                (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
 
         CUDA_OK(cudaGetLastError());
     }
 
 protected:
-    // coset_ext_pow is only used when NTT type is coset
     static void NTT_internal(fr_t* d_inout, uint32_t lg_domain_size,
                              InputOutputOrder order, Direction direction,
-                             Type type, stream_t& stream,
-                             bool coset_ext_pow = false)
+                             Type type, stream_t& stream)
     {
         // Pick an NTT algorithm based on the input order and the desired output
         // order of the data. In certain cases, bit reversal can be avoided which
@@ -116,8 +115,7 @@ protected:
         }
 
         if (!intt && type == Type::coset)
-            LDE_powers(d_inout, intt, bitrev, lg_domain_size, 0, stream,
-                       coset_ext_pow);
+            LDE_powers(d_inout, intt, bitrev, lg_domain_size, 0, stream);
 
         switch (algorithm) {
         case Algorithm::GS:
@@ -129,8 +127,7 @@ protected:
         }
 
         if (intt && type == Type::coset)
-            LDE_powers(d_inout, intt, !bitrev, lg_domain_size, 0, stream,
-                       coset_ext_pow);
+            LDE_powers(d_inout, intt, !bitrev, lg_domain_size, 0, stream);
 
         if (order == InputOutputOrder::RR)
             bit_rev(d_inout, d_inout, lg_domain_size, stream);
@@ -139,7 +136,7 @@ public:
     static RustError Base(const gpu_t& gpu, fr_t* inout, uint32_t lg_domain_size,
                           InputOutputOrder order, Direction direction,
-                          Type type, bool coset_ext_pow = false)
+                          Type type)
     {
         if (lg_domain_size == 0)
             return RustError{cudaSuccess};
@@ -151,8 +148,7 @@ public:
             dev_ptr_t<fr_t> d_inout{domain_size, gpu};
             gpu.HtoD(&d_inout[0], inout, domain_size);
 
-            NTT_internal(&d_inout[0], lg_domain_size, order, direction, type, gpu,
-                         coset_ext_pow);
+            NTT_internal(&d_inout[0], lg_domain_size, order, direction, type, gpu);
 
             gpu.DtoH(inout, &d_inout[0], domain_size);
             gpu.sync();
@@ -168,48 +164,6 @@ public:
         return RustError{cudaSuccess};
     }
 
-    static RustError LDE(const gpu_t& gpu, fr_t* inout,
-                         uint32_t lg_domain_size, uint32_t lg_blowup,
-                         bool ext_pow = false)
-    {
-        try {
-            gpu.select();
-
-            size_t domain_size = (size_t)1 << lg_domain_size;
-            size_t ext_domain_size = domain_size << lg_blowup;
-            dev_ptr_t<fr_t> d_ext_domain{ext_domain_size, gpu};
-            fr_t* d_domain = &d_ext_domain[ext_domain_size - domain_size];
-
-            gpu.HtoD(&d_domain[0], inout, domain_size);
-
-            NTT_internal(&d_domain[0], lg_domain_size,
-                         InputOutputOrder::NR, Direction::inverse,
-                         Type::standard, gpu);
-
-            const auto gen_powers =
-                NTTParameters::all()[gpu.id()]->partial_group_gen_powers;
-
-            LDE_launch(gpu, &d_ext_domain[0], &d_domain[0],
-                       gen_powers, lg_domain_size, lg_blowup, ext_pow);
-
-            NTT_internal(&d_ext_domain[0], lg_domain_size + lg_blowup,
-                         InputOutputOrder::RN, Direction::forward,
-                         Type::standard, gpu);
-
-            gpu.DtoH(inout, &d_ext_domain[0], ext_domain_size);
-            gpu.sync();
-        } catch (const cuda_error& e) {
-            gpu.sync();
-#ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE
-            return RustError{e.code(), e.what()};
-#else
-            return RustError{e.code()};
-#endif
-        }
-
-        return RustError{cudaSuccess};
-    }
-
 protected:
     static void LDE_launch(stream_t& stream,
                            fr_t* ext_domain_data, fr_t* domain_data,
@@ -252,17 +206,19 @@ public:
     static RustError LDE_aux(const gpu_t& gpu, fr_t* inout,
                              uint32_t lg_domain_size, uint32_t lg_blowup,
-                             bool ext_pow = false)
+                             fr_t *aux_out = nullptr)
     {
         try {
             size_t domain_size = (size_t)1 << lg_domain_size;
             size_t ext_domain_size = domain_size << lg_blowup;
+            size_t aux_size = aux_out != nullptr ? domain_size : 0;
             // The 2nd to last 'domain_size' chunk will hold the original data
             // The last chunk will get the bit reversed iNTT data
-            dev_ptr_t<fr_t> d_inout{ext_domain_size + domain_size, gpu}; // + domain_size for aux buffer
+            dev_ptr_t<fr_t> d_inout{ext_domain_size + aux_size, gpu};
             fr_t* aux_data = &d_inout[ext_domain_size];
             fr_t* domain_data = &d_inout[ext_domain_size - domain_size]; // aligned to the end
             fr_t* ext_domain_data = &d_inout[0];
+
             gpu.HtoD(domain_data, inout, domain_size);
 
             NTT_internal(domain_data, lg_domain_size,
@@ -272,17 +228,26 @@ public:
             const auto gen_powers =
                 NTTParameters::all()[gpu.id()]->partial_group_gen_powers;
 
-            bit_rev(aux_data, domain_data, lg_domain_size, gpu);
+            event_t sync_event;
+
+            if (aux_out != nullptr) {
+                bit_rev(aux_data, domain_data, lg_domain_size, gpu);
+                sync_event.record(gpu);
+            }
 
             LDE_launch(gpu, ext_domain_data, domain_data, gen_powers,
-                       lg_domain_size, lg_blowup, true, ext_pow);
+                       lg_domain_size, lg_blowup);
 
             // NTT - RN
             NTT_internal(ext_domain_data, lg_domain_size + lg_blowup,
                          InputOutputOrder::RN, Direction::forward,
                          Type::standard, gpu);
 
-            gpu.DtoH(inout, ext_domain_data, domain_size << lg_blowup);
+            if (aux_out != nullptr) {
+                sync_event.wait(gpu[0]);
+                gpu[0].DtoH(aux_out, aux_data, aux_size);
+            }
+            gpu.DtoH(inout, ext_domain_data, ext_domain_size);
             gpu.sync();
         } catch (const cuda_error& e) {
             gpu.sync();
@@ -296,22 +261,24 @@ public:
         return RustError{cudaSuccess};
     }
 
-    // coset_ext_pow is only used when NTT type is coset
+    static RustError LDE(const gpu_t& gpu, fr_t* inout,
+                         uint32_t lg_domain_size, uint32_t lg_blowup)
+    {   return LDE_aux(gpu, inout, lg_domain_size, lg_blowup);   }
+
     static void Base_dev_ptr(stream_t& stream, fr_t* d_inout,
                              uint32_t lg_domain_size, InputOutputOrder order,
-                             Direction direction, Type type,
-                             bool coset_ext_pow = false)
+                             Direction direction, Type type)
     {
         size_t domain_size = (size_t)1 << lg_domain_size;
 
         NTT_internal(&d_inout[0], lg_domain_size, order, direction, type,
-                     stream, coset_ext_pow);
+                     stream);
     }
 
     static void LDE_powers(stream_t& stream, fr_t* d_inout,
-                           uint32_t lg_domain_size, bool ext_pow = false)
+                           uint32_t lg_domain_size)
     {
-        LDE_powers(d_inout, false, true, lg_domain_size, 0, stream, ext_pow);
+        LDE_powers(d_inout, false, true, lg_domain_size, 0, stream);
     }
 
     // If d_out and d_in overlap, d_out is expected to encompass d_in and
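A note on the LDE_aux() rework above: the bit-reversed iNTT copy is now produced only when the caller asks for it, and the event_t record/wait pair lets the auxiliary device-to-host copy on the secondary stream gpu[0] overlap the forward NTT still queued on the main stream. Stripped of sppark's wrappers, the pattern is plain CUDA runtime API; a generic sketch with hypothetical kernel and buffer names, not sppark's:

    #include <cuda_runtime.h>

    __global__ void produce_aux(float* aux)  { aux[threadIdx.x] += 1.0f; }
    __global__ void heavy_work(float* data)  { data[threadIdx.x] *= 2.0f; }

    void overlap_copy(float* d_main, float* d_aux, float* h_aux, unsigned n,
                      cudaStream_t main_stream, cudaStream_t copy_stream)
    {
        cudaEvent_t ev;
        cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);

        produce_aux<<<1, n, 0, main_stream>>>(d_aux);
        cudaEventRecord(ev, main_stream);           // aux contents are final here

        heavy_work<<<1, n, 0, main_stream>>>(d_main);   // keeps main_stream busy

        cudaStreamWaitEvent(copy_stream, ev, 0);    // copy must not start earlier
        cudaMemcpyAsync(h_aux, d_aux, n * sizeof(float),
                        cudaMemcpyDeviceToHost, copy_stream);

        cudaEventDestroy(ev);   // safe: destruction is deferred until completion
    }

The copy is correctly ordered after the producer kernel yet runs concurrently with heavy_work, which is exactly what the sync_event.record(gpu) / sync_event.wait(gpu[0]) pair achieves in LDE_aux().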
diff --git a/poc/msm-cuda/Cargo.toml b/poc/msm-cuda/Cargo.toml
index 6b606b7..226e580 100644
--- a/poc/msm-cuda/Cargo.toml
+++ b/poc/msm-cuda/Cargo.toml
@@ -2,6 +2,7 @@
 name = "msm-cuda"
 version = "0.1.0"
 edition = "2021"
+publish = false
 
 [features]
 # By default, compile with ADX extension if the host supports it.
@@ -37,7 +38,6 @@ which = "^4.0"
 
 [dev-dependencies]
 criterion = { version = "0.3", features = [ "html_reports" ] }
-rayon = "1.5"
 
 [[bench]]
 name = "msm"
diff --git a/poc/ntt-cuda/Cargo.toml b/poc/ntt-cuda/Cargo.toml
index fdd1822..3498b5d 100644
--- a/poc/ntt-cuda/Cargo.toml
+++ b/poc/ntt-cuda/Cargo.toml
@@ -32,8 +32,6 @@ sppark = { path = "../../rust" }
 cc = "^1.0.70"
 
 [dev-dependencies]
-criterion = { version = "0.3", features = [ "html_reports" ] }
-rayon = "1.5"
 rand = "^0"
 ark-std = "0.3.0"
 ark-ff = "0.3.0"
diff --git a/poc/ntt-cuda/cuda/ntt_api.cu b/poc/ntt-cuda/cuda/ntt_api.cu
index 3ca32fe..38f4d78 100644
--- a/poc/ntt-cuda/cuda/ntt_api.cu
+++ b/poc/ntt-cuda/cuda/ntt_api.cu
@@ -27,10 +27,11 @@
 #ifndef __CUDA_ARCH__
 
 extern "C"
-RustError compute_ntt(size_t device_id, fr_t* inout, uint32_t lg_domain_size,
-                      NTT::InputOutputOrder ntt_order,
-                      NTT::Direction ntt_direction,
-                      NTT::Type ntt_type)
+RustError::by_value compute_ntt(size_t device_id,
+                                fr_t* inout, uint32_t lg_domain_size,
+                                NTT::InputOutputOrder ntt_order,
+                                NTT::Direction ntt_direction,
+                                NTT::Type ntt_type)
 {
     auto& gpu = select_gpu(device_id);
 
diff --git a/poc/ntt-cuda/src/lib.rs b/poc/ntt-cuda/src/lib.rs
index 6847b4e..92162a1 100644
--- a/poc/ntt-cuda/src/lib.rs
+++ b/poc/ntt-cuda/src/lib.rs
@@ -2,27 +2,7 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-sppark::cuda_error!();
-
-#[repr(C)]
-pub enum NTTInputOutputOrder {
-    NN = 0,
-    NR = 1,
-    RN = 2,
-    RR = 3,
-}
-
-#[repr(C)]
-enum NTTDirection {
-    Forward = 0,
-    Inverse = 1,
-}
-
-#[repr(C)]
-enum NTTType {
-    Standard = 0,
-    Coset = 1,
-}
+use sppark::{NTTInputOutputOrder, NTTDirection, NTTType};
 
 extern "C" {
     fn compute_ntt(
@@ -32,7 +12,7 @@ extern "C" {
         ntt_order: NTTInputOutputOrder,
         ntt_direction: NTTDirection,
         ntt_type: NTTType,
-    ) -> cuda::Error;
+    ) -> sppark::Error;
 }
 
 /// Compute an in-place NTT on the input data.
diff --git a/poc/ntt-cuda/tests/ntt.rs b/poc/ntt-cuda/tests/ntt.rs
index 9f69a26..bb8c1ba 100644
--- a/poc/ntt-cuda/tests/ntt.rs
+++ b/poc/ntt-cuda/tests/ntt.rs
@@ -2,7 +2,7 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-use ntt_cuda::NTTInputOutputOrder;
+use sppark::NTTInputOutputOrder;
 
 const DEFAULT_GPU: usize = 0;
 
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index a83f5ad..f047ff0 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -93,3 +93,23 @@ impl<T> Clone for Gpu_Ptr<T> {
         unsafe { transmute::<_, _>(clone_gpu_ptr_t(transmute::<&_, &_>(self))) }
     }
 }
+
+#[repr(C)]
+pub enum NTTInputOutputOrder {
+    NN = 0,
+    NR = 1,
+    RN = 2,
+    RR = 3,
+}
+
+#[repr(C)]
+pub enum NTTDirection {
+    Forward = 0,
+    Inverse = 1,
+}
+
+#[repr(C)]
+pub enum NTTType {
+    Standard = 0,
+    Coset = 1,
+}
diff --git a/util/gpu_t.cuh b/util/gpu_t.cuh
index e9e5eda..008a1f6 100644
--- a/util/gpu_t.cuh
+++ b/util/gpu_t.cuh
@@ -299,7 +299,7 @@ public:
     inline operator T*() const  { return ptr->ptr; }
 
     // facilitate return by value through FFI, as gpu_ptr_t<T>::by_value.
-    struct by_value { inner *ptr; };
+    using by_value = struct { inner *ptr; };
     operator by_value() const
     {   ptr->ref_cnt.fetch_add(1, std::memory_order_relaxed); return {ptr};   }
     gpu_ptr_t(by_value v)   { ptr = v.ptr; }
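The by_value change above (and the matching one in util/rusterror.h below) swaps a named nested struct for an alias to an unnamed one; either way, the point of the idiom is that extern "C" entry points return a trivial aggregate, which the C ABI and Rust's #[repr(C)] agree on, while the C++ side converts from the richer reference-counted type on the way out. A minimal sketch of the idiom with hypothetical names, not sppark's API:

    #include <cstdio>

    struct handle_t {
        int* ptr;
        handle_t(int* p) : ptr(p) {}              // non-trivial C++ surface...

        using by_value = struct { int* ptr; };    // ...but a trivial mirror for FFI
        operator by_value() const { return {ptr}; }
    };

    // Returns the POD mirror by value; a Rust caller would declare a
    // matching #[repr(C)] struct as the return type.
    extern "C" handle_t::by_value make_handle()
    {
        static int storage = 42;
        return handle_t(&storage);                // converted to the POD on return
    }

    int main()
    {
        printf("%d\n", *make_handle().ptr);       // prints 42
        return 0;
    }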
diff --git a/util/rusterror.h b/util/rusterror.h
index 9e8996e..02c0aff 100644
--- a/util/rusterror.h
+++ b/util/rusterror.h
@@ -24,7 +24,7 @@ struct RustError { /* to be returned exclusively by value */
     {   message = str==nullptr ? nullptr : strdup(str);   }
     // no destructor[!], Rust takes care of the |message|
 
-    struct by_value {
+    using by_value = struct {
         int code;
         char *message;
     };
diff --git a/util/thread_pool_t.hpp b/util/thread_pool_t.hpp
index 5ab7385..ff394a6 100644
--- a/util/thread_pool_t.hpp
+++ b/util/thread_pool_t.hpp
@@ -127,6 +127,8 @@ class thread_pool_t {
             return;
         }
+#else
+        (void)affinity_env;
 #endif
         init(0);
     }
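Finally, the NN/NR/RN/RR variants of NTTInputOutputOrder, now exported from the sppark crate, describe whether the transform's input and output are in natural (N) or bit-reversed (R) index order; an "R" side means element i sits at position bit_rev(i, lg_domain_size). A straightforward host-side counterpart of the device bit_rev() helper (illustrative; the kernels use a hardware bit-reverse instruction instead):

    #include <cassert>

    template<class T>
    T bit_rev(T i, unsigned int nbits)
    {
        T r = 0;
        for (unsigned int b = 0; b < nbits; b++, i >>= 1)
            r = (r << 1) | (i & 1);     // move the lowest bit to the top
        return r;
    }

    int main()
    {
        const unsigned lg = 3;          // domain of 8 elements
        // natural order 0..7 maps to 0,4,2,6,1,5,3,7
        unsigned expect[8] = {0, 4, 2, 6, 1, 5, 3, 7};
        for (unsigned i = 0; i < 8; i++)
            assert(bit_rev(i, lg) == expect[i]);
        // bit_rev is an involution, so an R-ordered output fed back as
        // R-ordered input round-trips to natural order
        for (unsigned i = 0; i < 8; i++)
            assert(bit_rev(bit_rev(i, lg), lg) == i);
        return 0;
    }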