From 3dacda2ecb72c12f97c9a579343ef6c7ecfdca35 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 6 Dec 2023 13:01:07 +0100 Subject: [PATCH 01/20] poc/ntt-cuda/cuda/ntt_api.cu: resolve clang warning. --- poc/ntt-cuda/cuda/ntt_api.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/poc/ntt-cuda/cuda/ntt_api.cu b/poc/ntt-cuda/cuda/ntt_api.cu index 3ca32fe..38f4d78 100644 --- a/poc/ntt-cuda/cuda/ntt_api.cu +++ b/poc/ntt-cuda/cuda/ntt_api.cu @@ -27,10 +27,11 @@ #ifndef __CUDA_ARCH__ extern "C" -RustError compute_ntt(size_t device_id, fr_t* inout, uint32_t lg_domain_size, - NTT::InputOutputOrder ntt_order, - NTT::Direction ntt_direction, - NTT::Type ntt_type) +RustError::by_value compute_ntt(size_t device_id, + fr_t* inout, uint32_t lg_domain_size, + NTT::InputOutputOrder ntt_order, + NTT::Direction ntt_direction, + NTT::Type ntt_type) { auto& gpu = select_gpu(device_id); From 71a96567675847b1e5e372cf53ae3731abe4a733 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 6 Dec 2023 13:01:47 +0100 Subject: [PATCH 02/20] ff/baby_bear.hpp: resolve clang compilation failure. clang apparently casts the operator-matching net too wide and confuses bb31_t x^pow, where pow is unsigned, with operator^(int,int). Just in case, recall that for the time being baby_bear.hpp facilitates syntax vetting by the host compiler, the CUDA code is generated by cicc. --- ff/baby_bear.hpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ff/baby_bear.hpp b/ff/baby_bear.hpp index cce438a..c7439f0 100644 --- a/ff/baby_bear.hpp +++ b/ff/baby_bear.hpp @@ -27,18 +27,18 @@ class bb31_t { // this is used in constant declaration, e.g. as bb31_t{11} inline constexpr bb31_t(int a) : val(((uint64_t)a << 32) % MOD) {} - static inline const bb31_t one() { return bb31_t(1); } - inline bb31_t& operator+=(bb31_t b) { return *this; } - inline bb31_t& operator-=(bb31_t b) { return *this; } - inline bb31_t& operator*=(bb31_t b) { return *this; } - inline bb31_t& operator^=(int b) { return *this; } - inline bb31_t& sqr() { return *this; } - friend bb31_t operator+(bb31_t a, bb31_t b) { return a += b; } - friend bb31_t operator-(bb31_t a, bb31_t b) { return a -= b; } - friend bb31_t operator*(bb31_t a, bb31_t b) { return a *= b; } - friend bb31_t operator^(bb31_t a, int b) { return a ^= b; } - inline void zero() { val = 0; } - inline bool is_zero() const { return val==0; } + static inline const bb31_t one() { return bb31_t(1); } + inline bb31_t& operator+=(bb31_t b) { return *this; } + inline bb31_t& operator-=(bb31_t b) { return *this; } + inline bb31_t& operator*=(bb31_t b) { return *this; } + inline bb31_t& operator^=(int b) { return *this; } + inline bb31_t& sqr() { return *this; } + friend bb31_t operator+(bb31_t a, bb31_t b) { return a += b; } + friend bb31_t operator-(bb31_t a, bb31_t b) { return a -= b; } + friend bb31_t operator*(bb31_t a, bb31_t b) { return a *= b; } + friend bb31_t operator^(bb31_t a, uint32_t b) { return a ^= b; } + inline void zero() { val = 0; } + inline bool is_zero() const { return val==0; } inline operator uint32_t() const { return ((val*M)*(uint64_t)MOD + val) >> 32; } inline void to() { val = ((uint64_t)val<<32) % MOD; } From 62ba663b73b2f6b7a0ff5f02683bcbfa9c30d0d5 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 6 Dec 2023 13:15:49 +0100 Subject: [PATCH 03/20] Add .github/workflows/ci.yml to exercise PoCs' build procedures. --- .github/workflows/ci.yml | 83 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..17a213a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,83 @@ +name: build + +on: + push: + branches: + - '**' + workflow_dispatch: + branches: + - '**' + pull_request: + branches: + - main + +jobs: + cargo-test-no-run: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Get date + id: get-date + run: echo "date=$(date -u +%Y-%m)" >> $GITHUB_OUTPUT + shell: bash + + - uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry + **/Cargo.lock + **/target + /usr/local/cuda-12.3 + key: ${{ runner.os }}-cargo-${{ steps.get-date.outputs.date }} + + - name: Environment + shell: bash + run: | + lscpu 2>/dev/null && echo --- || true + env | sort + + - name: Install cuda-minimal-build-12-3 + shell: bash + run: | + if [ ! -d /usr/local/cuda-12.3 ]; then + # https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + sudo apt-get -y install cuda-minimal-build-12-3 + fi + [ -d /usr/local/cuda-12.3/bin ] + + - name: The test + shell: bash + run: | + rustc --version --verbose + export PATH=$PATH:/usr/local/cuda-12.3/bin + ( cd poc/ntt-cuda + cargo test --release --no-run --features=bls12_381 + cargo test --release --no-run --features=gl64 + cargo test --release --no-run --features=bb31 + export CXX=clang++ + cargo test --release --no-run --features=bls12_381 + cargo test --release --no-run --features=gl64 + cargo test --release --no-run --features=bb31 + cargo clean -p ntt-cuda + cargo clean -p ntt-cuda --release + ) + ( cd poc/msm-cuda + cargo test --release --no-run --features=bls12_381 + cargo test --release --no-run --features=bn254 + export CXX=clang++ + cargo test --release --no-run --features=bls12_381 + cargo test --release --no-run --features=bn254 + cargo clean -p msm-cuda + cargo clean -p msm-cuda --release + ) + rm -rf poc/*/target/.rustc_info.json + rm -rf poc/*/target/package + rm -rf poc/*/target/{debug,release}/incremental + rm -rf poc/*/target/*/{debug,release}/incremental + rm -rf ~/.cargo/registry/src + rm -rf ~/.cargo/registry/index/*/.cache From 121c757ff31f0a4fa9a0920b4876a28431875c91 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 6 Dec 2023 13:28:25 +0100 Subject: [PATCH 04/20] poc/ntt-cuda/Cargo.toml: drop unused dev-dependencies. --- .github/workflows/ci.yml | 77 ++++++++++++++++++++++++++-------------- poc/ntt-cuda/Cargo.toml | 2 -- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 17a213a..cea979d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: build +name: test-build on: push: @@ -29,7 +29,6 @@ jobs: ~/.cargo/registry **/Cargo.lock **/target - /usr/local/cuda-12.3 key: ${{ runner.os }}-cargo-${{ steps.get-date.outputs.date }} - name: Environment @@ -41,43 +40,69 @@ jobs: - name: Install cuda-minimal-build-12-3 shell: bash run: | - if [ ! -d /usr/local/cuda-12.3 ]; then - # https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb - sudo dpkg -i cuda-keyring_1.1-1_all.deb - sudo apt-get update - sudo apt-get -y install cuda-minimal-build-12-3 - fi + # https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + sudo apt-get -y install cuda-minimal-build-12-3 [ -d /usr/local/cuda-12.3/bin ] - - name: The test + - name: Test-build poc/ntt-cuda shell: bash run: | rustc --version --verbose export PATH=$PATH:/usr/local/cuda-12.3/bin ( cd poc/ntt-cuda - cargo test --release --no-run --features=bls12_381 - cargo test --release --no-run --features=gl64 - cargo test --release --no-run --features=bb31 - export CXX=clang++ - cargo test --release --no-run --features=bls12_381 - cargo test --release --no-run --features=gl64 - cargo test --release --no-run --features=bb31 + cargo update + cargo test --no-run --release --features=bls12_381 + cargo test --no-run --release --features=gl64 + cargo test --no-run --release --features=bb31 + if which clang++ 2>/dev/null; then + echo + echo Testing with clang++ + echo + clang++ --version + echo + export CXX=clang++ + cargo test --no-run --release --features=bls12_381 + cargo test --no-run --release --features=gl64 + cargo test --no-run --release --features=bb31 + fi cargo clean -p ntt-cuda cargo clean -p ntt-cuda --release + rm -rf target/.rustc_info.json + rm -rf target/package + rm -rf target/{debug,release}/incremental + rm -rf target/*/{debug,release}/incremental ) + + - name: Test-build poc/msm-cuda + shell: bash + run: | + rustc --version --verbose + export PATH=$PATH:/usr/local/cuda-12.3/bin ( cd poc/msm-cuda - cargo test --release --no-run --features=bls12_381 - cargo test --release --no-run --features=bn254 - export CXX=clang++ - cargo test --release --no-run --features=bls12_381 - cargo test --release --no-run --features=bn254 + sed "s/^crit/#crit/" Cargo.toml > Cargo.$$.toml && \ + mv Cargo.$$.toml Cargo.toml + cargo update + cargo test --no-run --release --features=bls12_381,quiet + cargo test --no-run --release --features=bn254,quiet + if which clang++ 2>/dev/null; then + echo + echo Testing with clang++ + echo + clang++ --version + echo + export CXX=clang++ + cargo test --no-run --release --features=bls12_381,quiet + cargo test --no-run --release --features=bn254,quiet + fi cargo clean -p msm-cuda cargo clean -p msm-cuda --release + rm -rf target/.rustc_info.json + rm -rf target/package + rm -rf target/{debug,release}/incremental + rm -rf target/*/{debug,release}/incremental ) - rm -rf poc/*/target/.rustc_info.json - rm -rf poc/*/target/package - rm -rf poc/*/target/{debug,release}/incremental - rm -rf poc/*/target/*/{debug,release}/incremental rm -rf ~/.cargo/registry/src rm -rf ~/.cargo/registry/index/*/.cache diff --git a/poc/ntt-cuda/Cargo.toml b/poc/ntt-cuda/Cargo.toml index fdd1822..3498b5d 100644 --- a/poc/ntt-cuda/Cargo.toml +++ b/poc/ntt-cuda/Cargo.toml @@ -32,8 +32,6 @@ sppark = { path = "../../rust" } cc = "^1.0.70" [dev-dependencies] -criterion = { version = "0.3", features = [ "html_reports" ] } -rayon = "1.5" rand = "^0" ark-std = "0.3.0" ark-ff = "0.3.0" From b8d0005a83f92c0a55a8fe0d8ad0d8f5968a8c30 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 28 Nov 2023 13:37:48 +0100 Subject: [PATCH 05/20] ntt/kernels/*_wide.cu: omit one multiplication. --- ntt/kernels/ct_mixed_radix_wide.cu | 3 +-- ntt/kernels/gs_mixed_radix_wide.cu | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ntt/kernels/ct_mixed_radix_wide.cu b/ntt/kernels/ct_mixed_radix_wide.cu index 11366ad..efa1a58 100644 --- a/ntt/kernels/ct_mixed_radix_wide.cu +++ b/ntt/kernels/ct_mixed_radix_wide.cu @@ -43,12 +43,11 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size, unsigned int nbits = MAX_LG_DOMAIN_SIZE - stage; index_t root_idx0 = bit_rev(thread_ntt_idx, nbits) * thread_ntt_pos; - index_t root_idx1 = thread_ntt_pos << (nbits - 1); + index_t root_idx1 = root_idx0 + (thread_ntt_pos << (nbits - 1)); fr_t first_root, second_root; get_intermediate_roots(first_root, second_root, root_idx0, root_idx1, d_partial_twiddles); - second_root *= first_root; r0 *= first_root; r1 *= second_root; diff --git a/ntt/kernels/gs_mixed_radix_wide.cu b/ntt/kernels/gs_mixed_radix_wide.cu index f5cd4bb..b6b94d4 100644 --- a/ntt/kernels/gs_mixed_radix_wide.cu +++ b/ntt/kernels/gs_mixed_radix_wide.cu @@ -89,12 +89,11 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size, unsigned int nbits = MAX_LG_DOMAIN_SIZE - (stage - iterations); index_t root_idx0 = bit_rev(thread_ntt_idx, nbits) * thread_ntt_pos; - index_t root_idx1 = thread_ntt_pos << (nbits - 1); + index_t root_idx1 = root_idx0 + (thread_ntt_pos << (nbits - 1)); fr_t first_root, second_root; get_intermediate_roots(first_root, second_root, root_idx0, root_idx1, d_partial_twiddles); - second_root *= first_root; r0 *= first_root; r1 *= second_root; From 8ae27a6ce4586323eb383031c317bd4ec88c4895 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 28 Nov 2023 13:38:32 +0100 Subject: [PATCH 06/20] ntt/kernels.cu: optimize some roots' calculations. --- ntt/kernels.cu | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/ntt/kernels.cu b/ntt/kernels.cu index 208c961..c45306a 100644 --- a/ntt/kernels.cu +++ b/ntt/kernels.cu @@ -142,7 +142,31 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE], { unsigned int off = 0; - fr_t t, root = roots[off][pow % WINDOW_SIZE]; + fr_t t, root; + + if (sizeof(fr_t) <= 8) { + root = fr_t::one(); + bool root_set = false; + + #pragma unroll + for (unsigned int pow_win, i = 0; i < WINDOW_NUM; i++) { + if (!root_set && (pow_win = pow % WINDOW_SIZE)) { + root = roots[i][pow_win]; + root_set = true; + } + if (!root_set) { + pow >>= LG_WINDOW_SIZE; + off++; + } + } + } else { + if ((pow % WINDOW_SIZE) == 0) { + pow >>= LG_WINDOW_SIZE; + off++; + } + root = roots[off][pow % WINDOW_SIZE]; + } + #pragma unroll 1 while (pow >>= LG_WINDOW_SIZE) root *= (t = roots[++off][pow % WINDOW_SIZE]); @@ -252,13 +276,16 @@ void get_intermediate_roots(fr_t& root0, fr_t& root1, { int win = (WINDOW_NUM - 1) * LG_WINDOW_SIZE; int off = (WINDOW_NUM - 1); + index_t idxo = idx0 | idx1; + index_t mask = ((index_t)1 << win) - 1; root0 = roots[off][idx0 >> win]; root1 = roots[off][idx1 >> win]; #pragma unroll 1 - while (off--) { + while (off-- && (idxo & mask)) { fr_t t; win -= LG_WINDOW_SIZE; + mask >>= LG_WINDOW_SIZE; root0 *= (t = roots[off][(idx0 >> win) % WINDOW_SIZE]); root1 *= (t = roots[off][(idx1 >> win) % WINDOW_SIZE]); } From 62045e72d1bd57f8f28f9ff4d6b8003e8f7e129d Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 27 Nov 2023 17:55:30 +0100 Subject: [PATCH 07/20] ntt/{kernels.cu,ntt.cuh}: omit redundant argument. In the LDE_powers() case the operation is fully described by |lg_blowup| parameter, and in the NTT_internal() case - by the |type| parameter. --- ntt/kernels.cu | 6 ++---- ntt/ntt.cuh | 43 +++++++++++++++++-------------------------- 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/ntt/kernels.cu b/ntt/kernels.cu index c45306a..3d78821 100644 --- a/ntt/kernels.cu +++ b/ntt/kernels.cu @@ -176,8 +176,7 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE], __launch_bounds__(1024) __global__ void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_blowup, bool bitrev, - const fr_t (*gen_powers)[WINDOW_SIZE], - bool ext_pow = false) + const fr_t (*gen_powers)[WINDOW_SIZE]) { index_t idx = threadIdx.x + blockDim.x * (index_t)blockIdx.x; index_t pow = idx; @@ -191,8 +190,7 @@ void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_blowup, bool bitrev, pow = bit_rev(idx, lg_domain_size); } - if (ext_pow) - pow <<= lg_blowup; + pow <<= lg_blowup; r = r * get_intermediate_root(pow, gen_powers); diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh index 734a9a5..ca4fcd5 100644 --- a/ntt/ntt.cuh +++ b/ntt/ntt.cuh @@ -58,7 +58,7 @@ protected: private: static void LDE_powers(fr_t* inout, bool innt, bool bitrev, uint32_t lg_domain_size, uint32_t lg_blowup, - stream_t& stream, bool ext_pow = false) + stream_t& stream) { size_t domain_size = (size_t)1 << lg_domain_size; const auto gen_powers = @@ -66,23 +66,21 @@ private: if (domain_size < WARP_SZ) LDE_distribute_powers<<<1, domain_size, 0, stream>>> - (inout, lg_blowup, bitrev, gen_powers, ext_pow); + (inout, lg_blowup, bitrev, gen_powers); else if (domain_size < 512) LDE_distribute_powers<<>> - (inout, lg_blowup, bitrev, gen_powers, ext_pow); + (inout, lg_blowup, bitrev, gen_powers); else LDE_distribute_powers<<>> - (inout, lg_blowup, bitrev, gen_powers, ext_pow); + (inout, lg_blowup, bitrev, gen_powers); CUDA_OK(cudaGetLastError()); } protected: - // coset_ext_pow is only used when NTT type is coset static void NTT_internal(fr_t* d_inout, uint32_t lg_domain_size, InputOutputOrder order, Direction direction, - Type type, stream_t& stream, - bool coset_ext_pow = false) + Type type, stream_t& stream) { // Pick an NTT algorithm based on the input order and the desired output // order of the data. In certain cases, bit reversal can be avoided which @@ -116,8 +114,7 @@ protected: } if (!intt && type == Type::coset) - LDE_powers(d_inout, intt, bitrev, lg_domain_size, 0, stream, - coset_ext_pow); + LDE_powers(d_inout, intt, bitrev, lg_domain_size, 0, stream); switch (algorithm) { case Algorithm::GS: @@ -129,8 +126,7 @@ protected: } if (intt && type == Type::coset) - LDE_powers(d_inout, intt, !bitrev, lg_domain_size, 0, stream, - coset_ext_pow); + LDE_powers(d_inout, intt, !bitrev, lg_domain_size, 0, stream); if (order == InputOutputOrder::RR) bit_rev(d_inout, d_inout, lg_domain_size, stream); @@ -139,7 +135,7 @@ protected: public: static RustError Base(const gpu_t& gpu, fr_t* inout, uint32_t lg_domain_size, InputOutputOrder order, Direction direction, - Type type, bool coset_ext_pow = false) + Type type) { if (lg_domain_size == 0) return RustError{cudaSuccess}; @@ -151,8 +147,7 @@ public: dev_ptr_t d_inout{domain_size, gpu}; gpu.HtoD(&d_inout[0], inout, domain_size); - NTT_internal(&d_inout[0], lg_domain_size, order, direction, type, gpu, - coset_ext_pow); + NTT_internal(&d_inout[0], lg_domain_size, order, direction, type, gpu); gpu.DtoH(inout, &d_inout[0], domain_size); gpu.sync(); @@ -169,8 +164,7 @@ public: } static RustError LDE(const gpu_t& gpu, fr_t* inout, - uint32_t lg_domain_size, uint32_t lg_blowup, - bool ext_pow = false) + uint32_t lg_domain_size, uint32_t lg_blowup) { try { gpu.select(); @@ -190,7 +184,7 @@ public: NTTParameters::all()[gpu.id()]->partial_group_gen_powers; LDE_launch(gpu, &d_ext_domain[0], &d_domain[0], - gen_powers, lg_domain_size, lg_blowup, ext_pow); + gen_powers, lg_domain_size, lg_blowup); NTT_internal(&d_ext_domain[0], lg_domain_size + lg_blowup, InputOutputOrder::RN, Direction::forward, @@ -251,8 +245,7 @@ protected: public: static RustError LDE_aux(const gpu_t& gpu, fr_t* inout, - uint32_t lg_domain_size, uint32_t lg_blowup, - bool ext_pow = false) + uint32_t lg_domain_size, uint32_t lg_blowup) { try { size_t domain_size = (size_t)1 << lg_domain_size; @@ -275,7 +268,7 @@ public: bit_rev(aux_data, domain_data, lg_domain_size, gpu); LDE_launch(gpu, ext_domain_data, domain_data, gen_powers, - lg_domain_size, lg_blowup, true, ext_pow); + lg_domain_size, lg_blowup); // NTT - RN NTT_internal(ext_domain_data, lg_domain_size + lg_blowup, @@ -296,22 +289,20 @@ public: return RustError{cudaSuccess}; } - // coset_ext_pow is only used when NTT type is coset static void Base_dev_ptr(stream_t& stream, fr_t* d_inout, uint32_t lg_domain_size, InputOutputOrder order, - Direction direction, Type type, - bool coset_ext_pow = false) + Direction direction, Type type) { size_t domain_size = (size_t)1 << lg_domain_size; NTT_internal(&d_inout[0], lg_domain_size, order, direction, type, - stream, coset_ext_pow); + stream); } static void LDE_powers(stream_t& stream, fr_t* d_inout, - uint32_t lg_domain_size, bool ext_pow = false) + uint32_t lg_domain_size) { - LDE_powers(d_inout, false, true, lg_domain_size, 0, stream, ext_pow); + LDE_powers(d_inout, false, true, lg_domain_size, 0, stream); } // If d_out and d_in overlap, d_out is expected to encompass d_in and From a6799d9a8472dbd451b98a04dec9ad099fb7471a Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 27 Nov 2023 18:04:35 +0100 Subject: [PATCH 08/20] ntt/{kernels.cu,ntt.cuh}: ensure LDE_powers works even for large domain sizes. --- ntt/kernels.cu | 30 +++++++++++++++--------------- ntt/ntt.cuh | 15 ++++++++------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/ntt/kernels.cu b/ntt/kernels.cu index 3d78821..da000ee 100644 --- a/ntt/kernels.cu +++ b/ntt/kernels.cu @@ -175,26 +175,26 @@ fr_t get_intermediate_root(index_t pow, const fr_t (*roots)[WINDOW_SIZE], } __launch_bounds__(1024) __global__ -void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_blowup, bool bitrev, +void LDE_distribute_powers(fr_t* d_inout, uint32_t lg_domain_size, + uint32_t lg_blowup, bool bitrev, const fr_t (*gen_powers)[WINDOW_SIZE]) { - index_t idx = threadIdx.x + blockDim.x * (index_t)blockIdx.x; - index_t pow = idx; - fr_t r = d_inout[idx]; - - if (bitrev) { - size_t domain_size = gridDim.x * (size_t)blockDim.x; - assert((domain_size & (domain_size-1)) == 0); - uint32_t lg_domain_size = 63 - __clzll(domain_size); - - pow = bit_rev(idx, lg_domain_size); - } +#if 0 + assert(blockDim.x * gridDim.x == blockDim.x * (size_t)gridDim.x); +#endif + size_t domain_size = (size_t)1 << lg_domain_size; + index_t idx = threadIdx.x + blockDim.x * blockIdx.x; - pow <<= lg_blowup; + #pragma unroll 1 + for (; idx < domain_size; idx += blockDim.x * gridDim.x) { + fr_t r = d_inout[idx]; - r = r * get_intermediate_root(pow, gen_powers); + index_t pow = bitrev ? bit_rev(idx, lg_domain_size) : idx; + pow <<= lg_blowup; + r *= get_intermediate_root(pow, gen_powers); - d_inout[idx] = r; + d_inout[idx] = r; + } } __launch_bounds__(1024) __global__ diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh index ca4fcd5..f68070e 100644 --- a/ntt/ntt.cuh +++ b/ntt/ntt.cuh @@ -57,22 +57,23 @@ protected: private: static void LDE_powers(fr_t* inout, bool innt, bool bitrev, - uint32_t lg_domain_size, uint32_t lg_blowup, + uint32_t lg_dsz, uint32_t lg_blowup, stream_t& stream) { - size_t domain_size = (size_t)1 << lg_domain_size; + size_t domain_size = (size_t)1 << lg_dsz; const auto gen_powers = NTTParameters::all(innt)[stream]->partial_group_gen_powers; if (domain_size < WARP_SZ) LDE_distribute_powers<<<1, domain_size, 0, stream>>> - (inout, lg_blowup, bitrev, gen_powers); - else if (domain_size < 512) + (inout, lg_dsz, lg_blowup, bitrev, gen_powers); + else if (lg_dsz < 32) LDE_distribute_powers<<>> - (inout, lg_blowup, bitrev, gen_powers); + (inout, lg_dsz, lg_blowup, bitrev, gen_powers); else - LDE_distribute_powers<<>> - (inout, lg_blowup, bitrev, gen_powers); + LDE_distribute_powers<<>> + (inout, lg_dsz, lg_blowup, bitrev, gen_powers); CUDA_OK(cudaGetLastError()); } From 3e9867c36544bb38bcd23703cd0050a5e39e5505 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 27 Nov 2023 18:14:12 +0100 Subject: [PATCH 09/20] ntt/ntt.cuh: make sense of |aux_data| in LDE_aux and de-duplicate LDE(). --- ntt/ntt.cuh | 65 +++++++++++++++++------------------------------------ 1 file changed, 20 insertions(+), 45 deletions(-) diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh index f68070e..d783927 100644 --- a/ntt/ntt.cuh +++ b/ntt/ntt.cuh @@ -164,47 +164,6 @@ public: return RustError{cudaSuccess}; } - static RustError LDE(const gpu_t& gpu, fr_t* inout, - uint32_t lg_domain_size, uint32_t lg_blowup) - { - try { - gpu.select(); - - size_t domain_size = (size_t)1 << lg_domain_size; - size_t ext_domain_size = domain_size << lg_blowup; - dev_ptr_t d_ext_domain{ext_domain_size, gpu}; - fr_t* d_domain = &d_ext_domain[ext_domain_size - domain_size]; - - gpu.HtoD(&d_domain[0], inout, domain_size); - - NTT_internal(&d_domain[0], lg_domain_size, - InputOutputOrder::NR, Direction::inverse, - Type::standard, gpu); - - const auto gen_powers = - NTTParameters::all()[gpu.id()]->partial_group_gen_powers; - - LDE_launch(gpu, &d_ext_domain[0], &d_domain[0], - gen_powers, lg_domain_size, lg_blowup); - - NTT_internal(&d_ext_domain[0], lg_domain_size + lg_blowup, - InputOutputOrder::RN, Direction::forward, - Type::standard, gpu); - - gpu.DtoH(inout, &d_ext_domain[0], ext_domain_size); - gpu.sync(); - } catch (const cuda_error& e) { - gpu.sync(); -#ifdef TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE - return RustError{e.code(), e.what()}; -#else - return RustError{e.code()}; -#endif - } - - return RustError{cudaSuccess}; - } - protected: static void LDE_launch(stream_t& stream, fr_t* ext_domain_data, fr_t* domain_data, @@ -246,17 +205,20 @@ protected: public: static RustError LDE_aux(const gpu_t& gpu, fr_t* inout, - uint32_t lg_domain_size, uint32_t lg_blowup) + uint32_t lg_domain_size, uint32_t lg_blowup, + fr_t *aux_out = nullptr) { try { size_t domain_size = (size_t)1 << lg_domain_size; size_t ext_domain_size = domain_size << lg_blowup; + size_t aux_size = aux_out != nullptr ? domain_size : 0; // The 2nd to last 'domain_size' chunk will hold the original data // The last chunk will get the bit reversed iNTT data - dev_ptr_t d_inout{ext_domain_size + domain_size, gpu}; // + domain_size for aux buffer + dev_ptr_t d_inout{ext_domain_size + aux_size, gpu}; // + domain_size for aux buffer fr_t* aux_data = &d_inout[ext_domain_size]; fr_t* domain_data = &d_inout[ext_domain_size - domain_size]; // aligned to the end fr_t* ext_domain_data = &d_inout[0]; + gpu.HtoD(domain_data, inout, domain_size); NTT_internal(domain_data, lg_domain_size, @@ -266,7 +228,12 @@ public: const auto gen_powers = NTTParameters::all()[gpu.id()]->partial_group_gen_powers; - bit_rev(aux_data, domain_data, lg_domain_size, gpu); + event_t sync_event; + + if (aux_out != nullptr) { + bit_rev(aux_data, domain_data, lg_domain_size, gpu); + sync_event.record(gpu); + } LDE_launch(gpu, ext_domain_data, domain_data, gen_powers, lg_domain_size, lg_blowup); @@ -276,7 +243,11 @@ public: InputOutputOrder::RN, Direction::forward, Type::standard, gpu); - gpu.DtoH(inout, ext_domain_data, domain_size << lg_blowup); + if (aux_out != nullptr) { + sync_event.wait(gpu[0]); + gpu[0].DtoH(aux_out, aux_data, aux_size); + } + gpu.DtoH(inout, ext_domain_data, ext_domain_size); gpu.sync(); } catch (const cuda_error& e) { gpu.sync(); @@ -290,6 +261,10 @@ public: return RustError{cudaSuccess}; } + static RustError LDE(const gpu_t& gpu, fr_t* inout, + uint32_t lg_domain_size, uint32_t lg_blowup) + { return LDE_aux(gpu, inout, lg_domain_size, lg_blowup); } + static void Base_dev_ptr(stream_t& stream, fr_t* d_inout, uint32_t lg_domain_size, InputOutputOrder order, Direction direction, Type type) From d575d33c079224ff982d6e8505408c76d22e0f26 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 2 Jan 2024 12:38:43 +0100 Subject: [PATCH 10/20] poc/msm-cuda/Cargo.toml: clean up. --- poc/msm-cuda/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poc/msm-cuda/Cargo.toml b/poc/msm-cuda/Cargo.toml index 6b606b7..226e580 100644 --- a/poc/msm-cuda/Cargo.toml +++ b/poc/msm-cuda/Cargo.toml @@ -2,6 +2,7 @@ name = "msm-cuda" version = "0.1.0" edition = "2021" +publish = false [features] # By default, compile with ADX extension if the host supports it. @@ -37,7 +38,6 @@ which = "^4.0" [dev-dependencies] criterion = { version = "0.3", features = [ "html_reports" ] } -rayon = "1.5" [[bench]] name = "msm" From 892c112a32a7b7babf86a1fe73593bcf40337882 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 2 Jan 2024 12:39:25 +0100 Subject: [PATCH 11/20] util/thread_pool_t.hpp: resolve -Wunused-parameter on non-Linux. --- util/thread_pool_t.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/util/thread_pool_t.hpp b/util/thread_pool_t.hpp index 5ab7385..ff394a6 100644 --- a/util/thread_pool_t.hpp +++ b/util/thread_pool_t.hpp @@ -127,6 +127,8 @@ class thread_pool_t { return; } +#else + (void)affinity_env; #endif init(0); } From 60e5157a7467e4f126d859fce98bac5d2ba50945 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 2 Jan 2024 12:41:44 +0100 Subject: [PATCH 12/20] util/{gpu_t.cuh,rusterror.h}: make it compile on Windows. --- util/gpu_t.cuh | 2 +- util/rusterror.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/util/gpu_t.cuh b/util/gpu_t.cuh index e9e5eda..008a1f6 100644 --- a/util/gpu_t.cuh +++ b/util/gpu_t.cuh @@ -299,7 +299,7 @@ public: inline operator T*() const { return ptr->ptr; } // facilitate return by value through FFI, as gpu_ptr_t::by_value. - struct by_value { inner *ptr; }; + using by_value = struct { inner *ptr; }; operator by_value() const { ptr->ref_cnt.fetch_add(1, std::memory_order_relaxed); return {ptr}; } gpu_ptr_t(by_value v) { ptr = v.ptr; } diff --git a/util/rusterror.h b/util/rusterror.h index 9e8996e..02c0aff 100644 --- a/util/rusterror.h +++ b/util/rusterror.h @@ -24,7 +24,7 @@ struct RustError { /* to be returned exclusively by value */ { message = str==nullptr ? nullptr : strdup(str); } // no destructor[!], Rust takes care of the |message| - struct by_value { + using by_value = struct { int code; char *message; }; From c536f0964f7dba7d5be5d385d1ccbcaaa7f2bfab Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 2 Jan 2024 12:45:24 +0100 Subject: [PATCH 13/20] ff/*: add shfl_bfly() method. --- ff/bb31_t.cuh | 3 +++ ff/gl64_t.cuh | 3 +++ ff/mont_t.cuh | 7 +++++++ 3 files changed, 13 insertions(+) diff --git a/ff/bb31_t.cuh b/ff/bb31_t.cuh index 9e65c97..2bb3250 100644 --- a/ff/bb31_t.cuh +++ b/ff/bb31_t.cuh @@ -353,6 +353,9 @@ public: { return a * b.reciprocal(); } inline bb31_t& operator/=(const bb31_t a) { return *this *= a.reciprocal(); } + + inline void shfl_bfly(uint32_t laneMask) + { val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask); } }; # undef inline diff --git a/ff/gl64_t.cuh b/ff/gl64_t.cuh index 0866a66..fd67640 100644 --- a/ff/gl64_t.cuh +++ b/ff/gl64_t.cuh @@ -608,6 +608,9 @@ public: return t1; } + + inline void shfl_bfly(uint32_t laneMask) + { val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask); } }; # undef inline diff --git a/ff/mont_t.cuh b/ff/mont_t.cuh index cc5c8ee..e4aad31 100644 --- a/ff/mont_t.cuh +++ b/ff/mont_t.cuh @@ -1109,6 +1109,13 @@ public: { return a * b.reciprocal(); } inline mont_t& operator/=(const mont_t& a) { return *this *= a.reciprocal(); } + + inline void shfl_bfly(uint32_t laneMask) + { + #pragma unroll + for (size_t i=0; i Date: Tue, 2 Jan 2024 12:46:44 +0100 Subject: [PATCH 14/20] ff/gl64_t.cuh: refactor reduce(). Better readability and marginal improvement in exponentiation. --- ff/gl64_t.cuh | 47 +++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/ff/gl64_t.cuh b/ff/gl64_t.cuh index fd67640..9ea17d7 100644 --- a/ff/gl64_t.cuh +++ b/ff/gl64_t.cuh @@ -267,45 +267,28 @@ private: inline void reduce(uint32_t temp[4]) { - uint32_t carry; # if __CUDA_ARCH__ >= 700 - asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, %4; subc.u32 %2, 0, 0;" - : "+r"(temp[0]), "+r"(temp[1]), "=r"(carry) - : "r"(temp[2]), "r"(temp[3])); - asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, %3;" - : "+r"(temp[1]), "+r"(carry) - : "r"(temp[2]), "r"(temp[3])); - - asm("mad.lo.cc.u32 %0, %3, %4, %0; madc.hi.cc.u32 %1, %3, %4, %1; addc.u32 %2, 0, 0;" - : "+r"(temp[0]), "+r"(temp[1]), "=r"(temp[2]) - : "r"(carry), "r"(gl64_device::W)); - asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;" - : "+r"(temp[0]), "+r"(temp[1]) - : "r"(temp[2]), "r"(gl64_device::W)); + asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1; addc.u32 %2, 0, 0;" + : "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2]) + : "r"(gl64_device::W)); # else uint32_t b0, b1; - asm("add.cc.u32 %0, %2, %3; addc.u32 %1, 0, 0;" + + asm("sub.cc.u32 %0, 0, %2; subc.u32 %1, %2, 0;" : "=r"(b0), "=r"(b1) - : "r"(temp[2]), "r"(temp[3])); - asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, %4; subc.u32 %2, 0, 0;" - : "+r"(temp[0]), "+r"(temp[1]), "=r"(carry) + : "r"(temp[2])); + asm("add.cc.u32 %0, %0, %3; addc.cc.u32 %1, %1, %4; addc.u32 %2, 0, 0;" + : "+r"(temp[0]), "+r"(temp[1]), "=r"(temp[2]) : "r"(b0), "r"(b1)); - asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, %3;" - : "+r"(temp[0]), "+r"(temp[1]) - : "r"(-carry), "r"(carry)); - asm("add.cc.u32 %0, %0, %1; addc.u32 %1, 0, 0;" - : "+r"(temp[1]), "+r"(temp[2])); +# endif + asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, 0; subc.u32 %2, %2, 0;" + : "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2]) + : "r"(temp[3])); -# if __CUDA_ARCH__ >= 700 - asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;" + asm("sub.cc.u32 %0, %0, %2; subc.u32 %1, %1, %3;" : "+r"(temp[0]), "+r"(temp[1]) - : "r"(temp[2]), "r"(gl64_device::W)); -# else - asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, 0;" - : "+r"(temp[0]), "+r"(temp[1]) - : "r"(-temp[2])); -# endif -# endif + : "r"(temp[2]), "r"(-(int)temp[2]>>1)); + asm("mov.b64 %0, {%1, %2};" : "=l"(val) : "r"(temp[0]), "r"(temp[1])); } From e5add3fa08908dcabcdb6c01e990b390cd21c294 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 2 Jan 2024 13:09:02 +0100 Subject: [PATCH 15/20] ntt/kernels/*: improve readability. --- ntt/kernels/ct_mixed_radix_narrow.cu | 2 +- ntt/kernels/ct_mixed_radix_wide.cu | 2 +- ntt/kernels/gs_mixed_radix_narrow.cu | 2 +- ntt/kernels/gs_mixed_radix_wide.cu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ntt/kernels/ct_mixed_radix_narrow.cu b/ntt/kernels/ct_mixed_radix_narrow.cu index d90e70b..1c72a4b 100644 --- a/ntt/kernels/ct_mixed_radix_narrow.cu +++ b/ntt/kernels/ct_mixed_radix_narrow.cu @@ -156,7 +156,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size, } // rotate "iterations" bits in indices - index_t mask = ((index_t)1 << (stage + iterations)) - ((index_t)1 << stage); + index_t mask = (index_t)((1 << iterations) - 1) << stage; index_t rotw = idx0 & mask; rotw = (rotw >> 1) | (rotw << (iterations - 1)); idx0 = (idx0 & ~mask) | (rotw & mask); diff --git a/ntt/kernels/ct_mixed_radix_wide.cu b/ntt/kernels/ct_mixed_radix_wide.cu index efa1a58..550eade 100644 --- a/ntt/kernels/ct_mixed_radix_wide.cu +++ b/ntt/kernels/ct_mixed_radix_wide.cu @@ -125,7 +125,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size, } // rotate "iterations" bits in indices - index_t mask = ((index_t)1 << (stage + iterations)) - ((index_t)1 << stage); + index_t mask = (index_t)((1 << iterations) - 1) << stage; index_t rotw = idx0 & mask; rotw = (rotw >> 1) | (rotw << (iterations - 1)); idx0 = (idx0 & ~mask) | (rotw & mask); diff --git a/ntt/kernels/gs_mixed_radix_narrow.cu b/ntt/kernels/gs_mixed_radix_narrow.cu index 3f0946c..b8added 100644 --- a/ntt/kernels/gs_mixed_radix_narrow.cu +++ b/ntt/kernels/gs_mixed_radix_narrow.cu @@ -159,7 +159,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size, } // rotate "iterations" bits in indices - index_t mask = ((index_t)1 << stage) - ((index_t)1 << (stage - iterations)); + index_t mask = (index_t)((1 << iterations) - 1) << (stage - iterations); index_t rotw = idx0 & mask; rotw = (rotw << 1) | (rotw >> (iterations - 1)); idx0 = (idx0 & ~mask) | (rotw & mask); diff --git a/ntt/kernels/gs_mixed_radix_wide.cu b/ntt/kernels/gs_mixed_radix_wide.cu index b6b94d4..7487e5d 100644 --- a/ntt/kernels/gs_mixed_radix_wide.cu +++ b/ntt/kernels/gs_mixed_radix_wide.cu @@ -119,7 +119,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size, } // rotate "iterations" bits in indices - index_t mask = ((index_t)1 << stage) - ((index_t)1 << (stage - iterations)); + index_t mask = (index_t)((1 << iterations) - 1) << (stage - iterations); index_t rotw = idx0 & mask; rotw = (rotw << 1) | (rotw >> (iterations - 1)); idx0 = (idx0 & ~mask) | (rotw & mask); From c332cc9041f2f48d1e437c02eda75d25c3872ea3 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 2 Jan 2024 13:11:20 +0100 Subject: [PATCH 16/20] ntt/{kernels.cu,kernels/*}: switch to shfl_bfly() method and clean up. --- ntt/kernels.cu | 25 ------------------------- ntt/kernels/ct_mixed_radix_narrow.cu | 2 +- ntt/kernels/ct_mixed_radix_wide.cu | 2 +- ntt/kernels/gs_mixed_radix_narrow.cu | 2 +- ntt/kernels/gs_mixed_radix_wide.cu | 2 +- 5 files changed, 4 insertions(+), 29 deletions(-) diff --git a/ntt/kernels.cu b/ntt/kernels.cu index da000ee..ee68b28 100644 --- a/ntt/kernels.cu +++ b/ntt/kernels.cu @@ -7,31 +7,6 @@ #include -#ifdef __CUDA_ARCH__ -__device__ __forceinline__ -void shfl_bfly(fr_t& r, int laneMask) -{ - #pragma unroll - for (int iter = 0; iter < r.len(); iter++) - r[iter] = __shfl_xor_sync(0xFFFFFFFF, r[iter], laneMask); -} -#endif - -__device__ __forceinline__ -void shfl_bfly(index_t& index, int laneMask) -{ - index = __shfl_xor_sync(0xFFFFFFFF, index, laneMask); -} - -template -__device__ __forceinline__ -void swap(T& u1, T& u2) -{ - T temp = u1; - u1 = u2; - u2 = temp; -} - template __device__ __forceinline__ T bit_rev(T i, unsigned int nbits) diff --git a/ntt/kernels/ct_mixed_radix_narrow.cu b/ntt/kernels/ct_mixed_radix_narrow.cu index 1c72a4b..1f008df 100644 --- a/ntt/kernels/ct_mixed_radix_narrow.cu +++ b/ntt/kernels/ct_mixed_radix_narrow.cu @@ -100,7 +100,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size, for (int z = 0; z < z_count; z++) { fr_t t = fr_t::csel(r[1][z], r[0][z], pos); - shfl_bfly(t, laneMask); + t.shfl_bfly(laneMask); r[0][z] = fr_t::csel(t, r[0][z], !pos); r[1][z] = fr_t::csel(t, r[1][z], pos); diff --git a/ntt/kernels/ct_mixed_radix_wide.cu b/ntt/kernels/ct_mixed_radix_wide.cu index 550eade..eec4a61 100644 --- a/ntt/kernels/ct_mixed_radix_wide.cu +++ b/ntt/kernels/ct_mixed_radix_wide.cu @@ -80,7 +80,7 @@ void _CT_NTT(const unsigned int radix, const unsigned int lg_domain_size, #ifdef __CUDA_ARCH__ fr_t x = fr_t::csel(r1, r0, pos); - shfl_bfly(x, laneMask); + x.shfl_bfly(laneMask); r0 = fr_t::csel(x, r0, !pos); r1 = fr_t::csel(x, r1, pos); #endif diff --git a/ntt/kernels/gs_mixed_radix_narrow.cu b/ntt/kernels/gs_mixed_radix_narrow.cu index b8added..a9f5f0a 100644 --- a/ntt/kernels/gs_mixed_radix_narrow.cu +++ b/ntt/kernels/gs_mixed_radix_narrow.cu @@ -104,7 +104,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size, #ifdef __CUDA_ARCH__ t = fr_t::csel(r[1][z], r[0][z], pos); - shfl_bfly(t, laneMask); + t.shfl_bfly(laneMask); r[0][z] = fr_t::csel(t, r[0][z], !pos); r[1][z] = fr_t::csel(t, r[1][z], pos); diff --git a/ntt/kernels/gs_mixed_radix_wide.cu b/ntt/kernels/gs_mixed_radix_wide.cu index 7487e5d..cd64da8 100644 --- a/ntt/kernels/gs_mixed_radix_wide.cu +++ b/ntt/kernels/gs_mixed_radix_wide.cu @@ -70,7 +70,7 @@ void _GS_NTT(const unsigned int radix, const unsigned int lg_domain_size, bool pos = rank < laneMask; #ifdef __CUDA_ARCH__ t = fr_t::csel(r1, r0, pos); - shfl_bfly(t, laneMask); + t.shfl_bfly(laneMask); r0 = fr_t::csel(t, r0, !pos); r1 = fr_t::csel(t, r1, pos); #endif From 370916e5abb65be8cd3555dc70d624771ff47f3f Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 7 Jan 2024 21:52:54 +0100 Subject: [PATCH 17/20] ff/mont_t.cuh: fix unary operator-(). --- ff/mont_t.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ff/mont_t.cuh b/ff/mont_t.cuh index e4aad31..9ba34d6 100644 --- a/ff/mont_t.cuh +++ b/ff/mont_t.cuh @@ -311,10 +311,10 @@ public: asm("}"); return *this; } - friend inline mont_t cneg(mont_t a, bool flag) + static inline mont_t cneg(mont_t a, bool flag) { return a.cneg(flag); } #else - friend inline mont_t cneg(const mont_t& a, bool flag) + static inline mont_t cneg(const mont_t& a, bool flag) { size_t i; uint32_t tmp[n], is_zero = a[0]; @@ -335,6 +335,8 @@ public: asm("}"); return ret; } + inline mont_t& cneg(bool flag) + { return *this = cneg(*this, flag); } #endif inline mont_t operator-() const { return cneg(*this, true); } From 65739f435088ff6d28f2e4baeb8404b1e4ad7836 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 7 Jan 2024 21:57:23 +0100 Subject: [PATCH 18/20] ff/{bb31,gl64}_t.cuh: harmonize unary operator-() implementations. --- ff/bb31_t.cuh | 4 ++-- ff/gl64_t.cuh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ff/bb31_t.cuh b/ff/bb31_t.cuh index 2bb3250..fb2a04f 100644 --- a/ff/bb31_t.cuh +++ b/ff/bb31_t.cuh @@ -104,10 +104,10 @@ public: return *this; } - friend inline bb31_t cneg(bb31_t a, bool flag) + static inline bb31_t cneg(bb31_t a, bool flag) { return a.cneg(flag); } inline bb31_t operator-() const - { bb31_t ret = *this; return ret.cneg(true); } + { return cneg(*this, true); } static inline const bb31_t one() { return bb31_t{ONE}; } inline bool is_one() const { return val == ONE; } diff --git a/ff/gl64_t.cuh b/ff/gl64_t.cuh index 9ea17d7..eb7164d 100644 --- a/ff/gl64_t.cuh +++ b/ff/gl64_t.cuh @@ -190,10 +190,10 @@ public: return *this; } - friend inline gl64_t cneg(gl64_t a, bool flag) + static inline gl64_t cneg(gl64_t a, bool flag) { return a.cneg(flag); } inline gl64_t operator-() const - { gl64_t ret = *this; return ret.cneg(true); } + { return cneg(*this, true); } static inline const gl64_t one() { gl64_t ret; ret.val = 1; return ret; } From a24b42bfd37f740d4c0ecaa8d54b2eef5c3245a3 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 7 Jan 2024 22:03:59 +0100 Subject: [PATCH 19/20] rust/src/lib.rs: add NTT enums declarations. --- rust/src/lib.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/rust/src/lib.rs b/rust/src/lib.rs index a83f5ad..f047ff0 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -93,3 +93,23 @@ impl Clone for Gpu_Ptr { unsafe { transmute::<_, _>(clone_gpu_ptr_t(transmute::<&_, &_>(self))) } } } + +#[repr(C)] +pub enum NTTInputOutputOrder { + NN = 0, + NR = 1, + RN = 2, + RR = 3, +} + +#[repr(C)] +pub enum NTTDirection { + Forward = 0, + Inverse = 1, +} + +#[repr(C)] +pub enum NTTType { + Standard = 0, + Coset = 1, +} From ada2ea2103fac3ae6b40a700acafb1f3c9cf6ed8 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 7 Jan 2024 22:05:23 +0100 Subject: [PATCH 20/20] poc/ntt-cuda: modernize. --- poc/ntt-cuda/src/lib.rs | 24 ++---------------------- poc/ntt-cuda/tests/ntt.rs | 2 +- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/poc/ntt-cuda/src/lib.rs b/poc/ntt-cuda/src/lib.rs index 6847b4e..92162a1 100644 --- a/poc/ntt-cuda/src/lib.rs +++ b/poc/ntt-cuda/src/lib.rs @@ -2,27 +2,7 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -sppark::cuda_error!(); - -#[repr(C)] -pub enum NTTInputOutputOrder { - NN = 0, - NR = 1, - RN = 2, - RR = 3, -} - -#[repr(C)] -enum NTTDirection { - Forward = 0, - Inverse = 1, -} - -#[repr(C)] -enum NTTType { - Standard = 0, - Coset = 1, -} +use sppark::{NTTInputOutputOrder, NTTDirection, NTTType}; extern "C" { fn compute_ntt( @@ -32,7 +12,7 @@ extern "C" { ntt_order: NTTInputOutputOrder, ntt_direction: NTTDirection, ntt_type: NTTType, - ) -> cuda::Error; + ) -> sppark::Error; } /// Compute an in-place NTT on the input data. diff --git a/poc/ntt-cuda/tests/ntt.rs b/poc/ntt-cuda/tests/ntt.rs index 9f69a26..bb8c1ba 100644 --- a/poc/ntt-cuda/tests/ntt.rs +++ b/poc/ntt-cuda/tests/ntt.rs @@ -2,7 +2,7 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -use ntt_cuda::NTTInputOutputOrder; +use sppark::NTTInputOutputOrder; const DEFAULT_GPU: usize = 0;