diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 717948ec..c280d555 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,8 @@ name: test -on: [push] +on: + push: + branches: + - '*' jobs: build: @@ -8,8 +11,8 @@ jobs: steps: - uses: actions/checkout@v4 - run: lscpu - - run: sudo apt update - - run: sudo apt install valgrind nasm + - run: apt -y update + - run: apt -y install valgrind nasm - run: make bin/pairing.exe -j4 - run: valgrind bin/pairing.exe - run: make clean diff --git a/Makefile b/Makefile index feaeb0fe..33569d3f 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ ifeq ($(MCL_STATIC_CODE),1) endif ifeq ($(CPU),x86-64) MCL_USE_XBYAK?=1 - TEST_SRC+=mont_fp_test.cpp sq_test.cpp + TEST_SRC+=mont_fp_test.cpp #sq_test.cpp ifeq ($(MCL_USE_XBYAK),1) TEST_SRC+=fp_generator_test.cpp endif diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp index d1f9f9a0..2e1395b5 100644 --- a/include/mcl/fp.hpp +++ b/include/mcl/fp.hpp @@ -207,15 +207,7 @@ class FpT : public fp::Serializable, } static inline bool squareRoot(FpT& y, const FpT& x) { - if (isMont()) return op_.sq.get(y, x); - mpz_class mx, my; - bool b = false; - x.getMpz(&b, mx); - if (!b) return false; - b = op_.sq.get(my, mx); - if (!b) return false; - y.setMpz(&b, my); - return b; + return op_.sq.get(y, x); } FpT() {} FpT(const FpT& x) diff --git a/include/mcl/gmp_util.hpp b/include/mcl/gmp_util.hpp index b0e731fc..d58134b1 100644 --- a/include/mcl/gmp_util.hpp +++ b/include/mcl/gmp_util.hpp @@ -784,6 +784,61 @@ class SquareRoot { } return false; } + /* + solve x^2 = a in Fp + */ + template + bool getCandidate(Fp& x, const Fp& a) const + { + assert(Fp::getOp().mp == p); + if (a.isZero() || a.isOne()) { + x = a; + return true; + } + if (r == 1) { + // (p + 1) / 4 = (q + 1) / 2 + Fp::pow(x, a, q_add_1_div_2); + return true; + } + Fp c, d; + { + bool b; + c.setMpz(&b, s); + assert(b); + } + int e = r; + Fp::pow(d, a, q); + Fp::pow(x, a, q_add_1_div_2); // destroy a if &x == &a + Fp dd; + Fp b; + while (!d.isOne()) { + int i = 1; + Fp::sqr(dd, d); + while (!dd.isOne()) { + Fp::sqr(dd, dd); + i++; + if (i >= e) return false; + } + assert(e > i); + int t = e - i - 1; + const int tMax = 30; // int32_t max + if (t < tMax) { + b = 1 << t; + } else { + b = 1 << tMax; + t -= tMax; + for (int j = 0; j < t; j++) { + b += b; + } + } + Fp::pow(b, c, b); + x *= b; + Fp::sqr(c, b); + d *= c; + e = i; + } + return true; + } public: SquareRoot() { clear(); } bool isPrecomputed() const { return isPrecomputed_; } @@ -838,100 +893,18 @@ class SquareRoot { q_add_1_div_2 = (q + 1) / 2; *pb = true; } - /* - solve x^2 = a mod p - */ - bool get(mpz_class& x, const mpz_class& a) const - { - if (!isPrime) { - return false; - } - if (a == 0) { - x = 0; - return true; - } - if (gmp::legendre(a, p) < 0) return false; - if (r == 1) { - // (p + 1) / 4 = (q + 1) / 2 - gmp::powMod(x, a, q_add_1_div_2, p); - return true; - } - mpz_class c = s, d; - int e = r; - gmp::powMod(d, a, q, p); - gmp::powMod(x, a, q_add_1_div_2, p); // destroy a if &x == &a - mpz_class dd; - mpz_class b; - while (d != 1) { - int i = 1; - dd = d * d; dd %= p; - while (dd != 1) { - dd *= dd; dd %= p; - i++; - } - b = 1; - b <<= e - i - 1; - gmp::powMod(b, c, b, p); - x *= b; x %= p; - c = b * b; c %= p; - d *= c; d %= p; - e = i; - } - return true; - } - /* - solve x^2 = a in Fp - */ - template - bool get(Fp& x, const Fp& a) const + template + bool get(T& x, const T& a) const { - assert(Fp::getOp().mp == p); - if (a == 0) { - x = 0; - return true; - } - { - bool b; - mpz_class aa; - a.getMpz(&b, aa); - assert(b); - if (gmp::legendre(aa, p) < 0) return false; - } - if (r == 1) { - // (p + 1) / 4 = (q + 1) / 2 - Fp::pow(x, a, q_add_1_div_2); - return true; - } - Fp c, d; - { - bool b; - c.setMpz(&b, s); - assert(b); - } - int e = r; - Fp::pow(d, a, q); - Fp::pow(x, a, q_add_1_div_2); // destroy a if &x == &a - Fp dd; - Fp b; - while (!d.isOne()) { - int i = 1; - Fp::sqr(dd, d); - while (!dd.isOne()) { - dd *= dd; - i++; - } - b = 1; -// b <<= e - i - 1; - for (int j = 0; j < e - i - 1; j++) { - b += b; + T t, t2; + if (getCandidate(t, a)) { + T::sqr(t2, t); + if (t2 == a) { + x = t; + return true; } - Fp::pow(b, c, b); - x *= b; - Fp::sqr(c, b); - d *= c; - e = i; } - return true; + return false; } bool operator==(const SquareRoot& rhs) const { diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index c2eef7d8..d9a013fd 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -29,7 +29,7 @@ namespace mcl { -static const int version = 0x210; /* 0xABC = A.BC */ +static const int version = 0x211; /* 0xABC = A.BC */ /* specifies available string format mode for X::setIoMode() diff --git a/include/mcl/vint.hpp b/include/mcl/vint.hpp index e26e0b3a..347aaf01 100644 --- a/include/mcl/vint.hpp +++ b/include/mcl/vint.hpp @@ -696,6 +696,9 @@ class Vint { // logical left shift (copy sign) static void shl(Vint& y, const Vint& x, size_t shiftBit) { +if (shiftBit > MCL_MAX_BIT_SIZE*2) { + printf("shiftBit=%zd\n", shiftBit); +} assert(shiftBit <= MCL_MAX_BIT_SIZE * 2); // many be too big size_t xn = x.size(); size_t yn = xn + (shiftBit + UnitBitSize - 1) / UnitBitSize; diff --git a/src/bint_impl.hpp b/src/bint_impl.hpp index 10c07100..5f270455 100644 --- a/src/bint_impl.hpp +++ b/src/bint_impl.hpp @@ -46,23 +46,11 @@ struct UnrollMulLowT<1, 1> { } // impl #if MCL_BINT_ASM != 1 -#ifdef MCL_WASM32 -inline uint64_t load8byte(const uint32_t *x) -{ - return x[0] | (uint64_t(x[1]) << 32); -} -inline void store8byte(uint32_t *x, uint64_t v) -{ - x[0] = uint32_t(v); - x[1] = uint32_t(v >> 32); -} -#endif template Unit addT(Unit *z, const Unit *x, const Unit *y) { -#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4 +#ifdef MCL_WASM32 // wasm32 supports 64-bit add -#if 1 uint64_t c = 0; for (size_t i = 0; i < N; i++) { uint64_t v = uint64_t(x[i]) + y[i] + c; @@ -70,26 +58,6 @@ Unit addT(Unit *z, const Unit *x, const Unit *y) c = v >> 32; } return uint32_t(c); -#else - uint32_t c = 0; - for (size_t i = 0; i < N - 1; i += 2) { - uint64_t xc = load8byte(x + i) + c; - c = xc < c; - uint64_t yi = load8byte(y + i); - xc += yi; - c += xc < yi; - store8byte(z + i, xc); - } - if (N & 1) { - uint32_t xc = x[N - 1] + c; - c = xc < c; - uint32_t yi = y[N - 1]; - xc += yi; - c += xc < yi; - z[N - 1] = xc; - } - return c; -#endif #else Unit c = 0; for (size_t i = 0; i < N; i++) { @@ -107,9 +75,8 @@ Unit addT(Unit *z, const Unit *x, const Unit *y) template Unit subT(Unit *z, const T *x, const Unit *y) { -#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4 +#ifdef MCL_WASM32 // wasm32 supports 64-bit sub -#if 1 uint64_t c = 0; for (size_t i = 0; i < N; i++) { uint64_t v = uint64_t(x[i]) - y[i] - c; @@ -117,26 +84,6 @@ Unit subT(Unit *z, const T *x, const Unit *y) c = v >> 63; } return c; -#else - uint32_t c = 0; - for (size_t i = 0; i < N - 1; i += 2) { - uint64_t yi = load8byte(y + i); - yi += c; - c = yi < c; - uint64_t xi = load8byte(x + i); - c += xi < yi; - store8byte(z + i, xi - yi); - } - if (N & 1) { - uint32_t yi = y[N - 1]; - yi += c; - c = yi < c; - uint32_t xi = x[N - 1]; - c += xi < yi; - z[N - 1] = xi - yi; - } - return c; -#endif #else Unit c = 0; for (size_t i = 0; i < N; i++) { @@ -203,7 +150,7 @@ Unit mulUnitT(T *z, const Unit *x, Unit y) template Unit mulUnitAddT(T *z, const Unit *x, Unit y) { -#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4 +#ifdef MCL_WASM32 uint64_t y_ = y; uint64_t v = z[0] + x[0] * y_; z[0] = uint32_t(v); diff --git a/src/low_func.hpp b/src/low_func.hpp index 6cd44081..067a8d3c 100644 --- a/src/low_func.hpp +++ b/src/low_func.hpp @@ -9,6 +9,10 @@ #include #include #include +//#ifdef MCL_WASM32 +// compiler option : -msimd128 +// #include +//#endif #ifdef _MSC_VER #pragma warning(push) @@ -147,7 +151,7 @@ template static void modRedT(Unit *z, const Unit *xy, const Unit *p) { const Unit rp = p[-1]; -#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4 +#ifdef MCL_WASM32 uint64_t buf[N * 2]; #else Unit buf[N * 2]; @@ -232,6 +236,155 @@ static void mulMontT(Unit *z, const Unit *x, const Unit *y, const Unit *p) } } +#if 0 // #ifdef MCL_WASM32 +template +uint64_t mulUnitLazyT(uint64_t *z, const Unit *x, Unit y) +{ +#if 1 + uint64_t y_ = y; + uint64_t H = x[0] * y_; + z[0] = uint32_t(H); + uint64_t Ls[N-1], Hs[N-1]; + Hs[0] = H >> 32; + for (size_t i = 1; i < N - 1; i++) { + H = x[i] * y_; + Ls[i-1] = uint32_t(H); + Hs[i] = H >> 32; + } + H = x[N-1] * y_; + Ls[N-2] = uint32_t(H); +#if 1 + for (size_t i = 0; i < ((N - 1) & ~1); i += 2) { +// z[i+1] = Hs[i] + Ls[i]; + v128_t vH = wasm_v128_load(&Hs[i]); + v128_t vL = wasm_v128_load(&Ls[i]); + wasm_v128_store(&z[i+1], wasm_i64x2_add(vH, vL)); + } + if ((N & 1) == 0) { + z[N - 1] = Hs[N - 2] + Ls[N - 2]; + } +#else + for (size_t i = 0; i < N - 1; i++) { + z[i+1] = Hs[i] + Ls[i]; + } +#endif + return H >> 32; +#else + uint64_t y_ = y; + uint64_t H = x[0] * y_; + z[0] = uint32_t(H); + for (size_t i = 1; i < N; i++) { + uint64_t v = x[i] * y_; + z[i] = uint32_t(v) + (H >> 32); + H = v; + } + return H >> 32; +#endif +} + +template +uint64_t mulUnitAddLazyT(uint64_t *z, const Unit *x, Unit y) +{ +#if 1 + uint64_t y_ = y; + uint64_t H = x[0] * y_ + z[0]; + z[0] = uint32_t(H); + uint64_t Ls[N-1], Hs[N-1]; + Hs[0] = H >> 32; + for (size_t i = 1; i < N - 1; i++) { + H = x[i] * y_; + Ls[i-1] = uint32_t(H); + Hs[i] = H >> 32; + } + H = x[N-1] * y_; + Ls[N-2] = uint32_t(H); +#if 1 + for (size_t i = 0; i < ((N - 1) & ~1); i += 2) { + v128_t t = wasm_v128_load(&z[i+1]); + t = wasm_i64x2_add(t, wasm_v128_load(&Hs[i])); + t = wasm_i64x2_add(t, wasm_v128_load(&Ls[i])); + wasm_v128_store(&z[i+1], t); + } + if ((N & 1) == 0) { + z[N - 1] += Hs[N - 2] + Ls[N - 2]; + } +#else + for (size_t i = 0; i < N - 1; i++) { + z[i+1] += Hs[i] + Ls[i]; + } +#endif + return H >> 32; +#else + uint64_t y_ = y; + uint64_t H = z[0] + x[0] * y_; + z[0] = uint32_t(H); + for (size_t i = 1; i < N; i++) { + uint64_t v = x[i] * y_; + z[i] = z[i] + uint32_t(v) + (H >> 32); + H = v; + } + return H >> 32; +#endif +} +#endif + +template +uint64_t mulUnitLazyT(uint64_t *z, const Unit *x, Unit y) +{ + uint64_t y_ = y; + uint64_t H = x[0] * y_; + z[0] = uint32_t(H); + for (size_t i = 1; i < N; i++) { + uint64_t v = x[i] * y_; + z[i] = uint32_t(v) + (H >> 32); + H = v; + } + return H >> 32; +} + +template +uint64_t mulUnitAddLazyT(uint64_t *z, const Unit *x, Unit y) +{ + uint64_t y_ = y; + uint64_t H = z[0] + x[0] * y_; + z[0] = uint32_t(H); + for (size_t i = 1; i < N; i++) { + uint64_t v = x[i] * y_; + z[i] = z[i] + uint32_t(v) + (H >> 32); + H = v; + } + return H >> 32; +} + +/* + r = bint::mulUnitAddT(z, x, *y); + q = z[0] * rp; + r += bint::mulUnitAddT(z, p, q); + return r; +*/ +template +uint64_t mulUnitAddLazy2T(uint64_t *z, const Unit *x, Unit y, const Unit *p, Unit rp) +{ + uint64_t y_ = y; + uint64_t H1 = z[0] + x[0] * y_; + uint64_t t = uint32_t(H1); + uint64_t q = uint32_t(t * rp); + uint64_t H2 = t + p[0] * q; + assert(uint32_t(H2) == 0); +// z[0] = uint32_t(H2); // set is unnecessary because it must be zero + for (size_t i = 1; i < N; i++) { + t = z[i]; + t += H1 >> 32; + H1 = x[i] * y_; + t += uint32_t(H1); + t += H2 >> 32; + H2 = p[i] * q; + t += uint32_t(H2); + z[i] = t; + } + return (H1 >> 32) + (H2 >> 32); +} + template static void mulMontNFT(Unit *z, const Unit *x, const Unit *y, const Unit *p) { @@ -247,7 +400,32 @@ static void mulMontNFT(Unit *z, const Unit *x, const Unit *y, const Unit *p) t >> 64 <= (F - 2)(R - 1)/R = (F - 2) - (F - 2)/R t + (t >> 64) = (F - 2)R - (F - 2)/R < FR */ -#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4 +#if 0 // #ifdef MCL_WASM32 + // use uint64_t if Unit = uint32_t to reduce conversion + uint64_t buf[N * 2]; + buf[N] = mulUnitLazyT(buf, x, y[0]); + Unit q = buf[0] * rp; + buf[N] += mulUnitAddLazyT(buf, p, q); + for (size_t i = 1; i < N; i++) { +#if 1 + buf[N + i] = mulUnitAddLazy2T(buf + i, x, y[i], p, rp); +#else + buf[N + i] = mulUnitAddLazyT(buf + i, x, y[i]); + q = buf[i] * rp; + buf[N + i] += mulUnitAddLazyT(buf + i, p, q); +#endif + } + uint64_t H = 0; + for (size_t i = N; i < N*2; i++) { + uint64_t v = buf[i] + (H >> 32); + buf[i] = uint32_t(v); + H = v; + } + if (bint::subT(z, buf + N, p)) { + bint::copyT(z, buf + N); + } +#else +#ifdef MCL_WASM32 // use uint64_t if Unit = uint32_t to reduce conversion uint64_t buf[N * 2]; #else @@ -264,6 +442,7 @@ static void mulMontNFT(Unit *z, const Unit *x, const Unit *y, const Unit *p) if (bint::subT(z, buf + N, p)) { bint::copyT(z, buf + N); } +#endif } // z[N] <- Montgomery(x[N], x[N], p[N]) diff --git a/test/bench.hpp b/test/bench.hpp index 6b3193a4..95f9b06a 100644 --- a/test/bench.hpp +++ b/test/bench.hpp @@ -88,6 +88,20 @@ void invVecBench(const char *msg) CYBOZU_BENCH_C(msg, C, mcl::invVec, x, x, n); } +template +void sqrBench(const T& x, const char *msg) +{ + T y = x * x, z; + CYBOZU_BENCH_C((std::string(msg) + "::squareRoot T").c_str(), 10000, T::squareRoot, z, y); + for (int i = 0; i < 100; i++) { + if (!T::squareRoot(z, y)) { + break; + } + y += T::one(); + } + CYBOZU_BENCH_C((std::string(msg) + "::squareRoot F").c_str(), 10000, T::squareRoot, z, y); +} + void testBench(const G1& P, const G2& Q) { #ifndef NDEBUG @@ -151,6 +165,7 @@ void testBench(const G1& P, const G2& Q) CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x); CYBOZU_BENCH_C("Fp::inv ", C3, invAdd, x, x, y); CYBOZU_BENCH_C("Fp::pow ", C3, Fp::pow, x, x, y); + sqrBench(x, "Fp"); invVecBench("Fp:invVec"); invVecBench("Fr:invVec"); { @@ -168,6 +183,7 @@ void testBench(const G1& P, const G2& Q) CYBOZU_BENCH_C("Fr::sqr ", C3, Fr::sqr, a, a); CYBOZU_BENCH_C("Fr::inv ", C3, invAdd, a, a, b); CYBOZU_BENCH_C("Fr::pow ", C3, Fr::pow, a, a, b); + sqrBench(a, "Fr"); } Fp2 xx, yy; xx.a = x; diff --git a/test/ec_test.cpp b/test/ec_test.cpp index dd11ba61..39fccb66 100644 --- a/test/ec_test.cpp +++ b/test/ec_test.cpp @@ -385,19 +385,37 @@ struct Test { } void squareRoot() const { + Ec P(Fp(para.gx), Fp(para.gy)); + + for (int i = 0; i < 100; i++) { + Ec::dbl(P, P); + P.normalize(); + Fp x = P.x; + Fp y = P.y; + Fp yy; + CYBOZU_TEST_ASSERT(Ec::getYfromX(yy, x, y.isOdd())); + CYBOZU_TEST_EQUAL(yy, y); + Fp::neg(y, y); + yy.clear(); + CYBOZU_TEST_ASSERT(Ec::getYfromX(yy, x, y.isOdd())); + CYBOZU_TEST_EQUAL(yy, y); + yy += P.y; + CYBOZU_TEST_ASSERT(yy.isZero()); + } + Fp x(para.gx); - Fp y(para.gy); - bool odd = y.isOdd(); - Fp yy; - bool b = Ec::getYfromX(yy, x, odd); - CYBOZU_TEST_ASSERT(b); - CYBOZU_TEST_EQUAL(yy, y); - Fp::neg(y, y); - odd = y.isOdd(); - yy.clear(); - b = Ec::getYfromX(yy, x, odd); - CYBOZU_TEST_ASSERT(b); - CYBOZU_TEST_EQUAL(yy, y); + for (int i = 0; i < 100; i++) { + mpz_class mx = x.getMpz(); + int ret = mcl::gmp::legendre(mx, Fp::getOp().mp); + Fp y; + if (Fp::squareRoot(y, x)) { + CYBOZU_TEST_EQUAL(y*y, x); + CYBOZU_TEST_EQUAL(ret, 1); + } else { + CYBOZU_TEST_EQUAL(ret, -1); + } + x += 1; + } } void mul_fp() const { diff --git a/test/sq_test.cpp b/test/sq_test.cpp index 4c386d23..0ab24c77 100644 --- a/test/sq_test.cpp +++ b/test/sq_test.cpp @@ -11,10 +11,15 @@ CYBOZU_TEST_AUTO(sqrt) sq.set(p); for (mpz_class a = 0; a < p; a++) { mpz_class x; - if (sq.get(x, a)) { + bool b1 = sq.get(x, a); + mpz_class x2; + bool b2 = sq.get2(x2, a); + CYBOZU_TEST_EQUAL(b1, b2); + if (b1) { mpz_class y; y = (x * x) % p; CYBOZU_TEST_EQUAL(a, y); + CYBOZU_TEST_EQUAL(x, x2); } } }