diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 717948ec..c280d555 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,5 +1,8 @@
 name: test
-on: [push]
+on:
+  push:
+    branches:
+      - '*'
 
 jobs:
   build:
@@ -8,8 +11,8 @@ jobs:
     steps:
     - uses: actions/checkout@v4
     - run: lscpu
-    - run: sudo apt update
-    - run: sudo apt install valgrind nasm
+    - run: apt -y update
+    - run: apt -y install valgrind nasm
     - run: make bin/pairing.exe -j4
     - run: valgrind bin/pairing.exe
     - run: make clean
diff --git a/Makefile b/Makefile
index feaeb0fe..33569d3f 100644
--- a/Makefile
+++ b/Makefile
@@ -28,7 +28,7 @@ ifeq ($(MCL_STATIC_CODE),1)
 endif
 ifeq ($(CPU),x86-64)
   MCL_USE_XBYAK?=1
-  TEST_SRC+=mont_fp_test.cpp sq_test.cpp
+  TEST_SRC+=mont_fp_test.cpp #sq_test.cpp
   ifeq ($(MCL_USE_XBYAK),1)
     TEST_SRC+=fp_generator_test.cpp
   endif
diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp
index d1f9f9a0..2e1395b5 100644
--- a/include/mcl/fp.hpp
+++ b/include/mcl/fp.hpp
@@ -207,15 +207,7 @@ class FpT : public fp::Serializable<FpT<tag, maxBitSize>,
 	}
 	static inline bool squareRoot(FpT& y, const FpT& x)
 	{
-		if (isMont()) return op_.sq.get(y, x);
-		mpz_class mx, my;
-		bool b = false;
-		x.getMpz(&b, mx);
-		if (!b) return false;
-		b = op_.sq.get(my, mx);
-		if (!b) return false;
-		y.setMpz(&b, my);
-		return b;
+		return op_.sq.get(y, x);
 	}
 	FpT() {}
 	FpT(const FpT& x)
diff --git a/include/mcl/gmp_util.hpp b/include/mcl/gmp_util.hpp
index b0e731fc..d58134b1 100644
--- a/include/mcl/gmp_util.hpp
+++ b/include/mcl/gmp_util.hpp
@@ -784,6 +784,61 @@ class SquareRoot {
 		}
 		return false;
 	}
+	/*
+		solve x^2 = a in Fp
+	*/
+	template<class Fp>
+	bool getCandidate(Fp& x, const Fp& a) const
+	{
+		assert(Fp::getOp().mp == p);
+		if (a.isZero() || a.isOne()) {
+			x = a;
+			return true;
+		}
+		if (r == 1) {
+			// (p + 1) / 4 = (q + 1) / 2
+			Fp::pow(x, a, q_add_1_div_2);
+			return true;
+		}
+		Fp c, d;
+		{
+			bool b;
+			c.setMpz(&b, s);
+			assert(b);
+		}
+		int e = r;
+		Fp::pow(d, a, q);
+		Fp::pow(x, a, q_add_1_div_2); // destroy a if &x == &a
+		Fp dd;
+		Fp b;
+		while (!d.isOne()) {
+			int i = 1;
+			Fp::sqr(dd, d);
+			while (!dd.isOne()) {
+				Fp::sqr(dd, dd);
+				i++;
+				if (i >= e) return false;
+			}
+			assert(e > i);
+			int t = e - i - 1;
+			const int tMax = 30; // int32_t max
+			if (t < tMax) {
+				b = 1 << t;
+			} else {
+				b = 1 << tMax;
+				t -= tMax;
+				for (int j = 0; j < t; j++) {
+					b += b;
+				}
+			}
+			Fp::pow(b, c, b);
+			x *= b;
+			Fp::sqr(c, b);
+			d *= c;
+			e = i;
+		}
+		return true;
+	}
 public:
 	SquareRoot() { clear(); }
 	bool isPrecomputed() const { return isPrecomputed_; }
@@ -838,100 +893,18 @@ class SquareRoot {
 		q_add_1_div_2 = (q + 1) / 2;
 		*pb = true;
 	}
-	/*
-		solve x^2 = a mod p
-	*/
-	bool get(mpz_class& x, const mpz_class& a) const
-	{
-		if (!isPrime) {
-			return false;
-		}
-		if (a == 0) {
-			x = 0;
-			return true;
-		}
-		if (gmp::legendre(a, p) < 0) return false;
-		if (r == 1) {
-			// (p + 1) / 4 = (q + 1) / 2
-			gmp::powMod(x, a, q_add_1_div_2, p);
-			return true;
-		}
-		mpz_class c = s, d;
-		int e = r;
-		gmp::powMod(d, a, q, p);
-		gmp::powMod(x, a, q_add_1_div_2, p); // destroy a if &x == &a
-		mpz_class dd;
-		mpz_class b;
-		while (d != 1) {
-			int i = 1;
-			dd = d * d; dd %= p;
-			while (dd != 1) {
-				dd *= dd; dd %= p;
-				i++;
-			}
-			b = 1;
-			b <<= e - i - 1;
-			gmp::powMod(b, c, b, p);
-			x *= b; x %= p;
-			c = b * b; c %= p;
-			d *= c; d %= p;
-			e = i;
-		}
-		return true;
-	}
-	/*
-		solve x^2 = a in Fp
-	*/
-	template<class Fp>
-	bool get(Fp& x, const Fp& a) const
+	template<class T>
+	bool get(T& x, const T& a) const
 	{
-		assert(Fp::getOp().mp == p);
-		if (a == 0) {
-			x = 0;
-			return true;
-		}
-		{
-			bool b;
-			mpz_class aa;
-			a.getMpz(&b, aa);
-			assert(b);
-			if (gmp::legendre(aa, p) < 0) return false;
-		}
-		if (r == 1) {
-			// (p + 1) / 4 = (q + 1) / 2
-			Fp::pow(x, a, q_add_1_div_2);
-			return true;
-		}
-		Fp c, d;
-		{
-			bool b;
-			c.setMpz(&b, s);
-			assert(b);
-		}
-		int e = r;
-		Fp::pow(d, a, q);
-		Fp::pow(x, a, q_add_1_div_2); // destroy a if &x == &a
-		Fp dd;
-		Fp b;
-		while (!d.isOne()) {
-			int i = 1;
-			Fp::sqr(dd, d);
-			while (!dd.isOne()) {
-				dd *= dd;
-				i++;
-			}
-			b = 1;
-//			b <<= e - i - 1;
-			for (int j = 0; j < e - i - 1; j++) {
-				b += b;
+		T t, t2;
+		if (getCandidate(t, a)) {
+			T::sqr(t2, t);
+			if (t2 == a) {
+				x = t;
+				return true;
 			}
-			Fp::pow(b, c, b);
-			x *= b;
-			Fp::sqr(c, b);
-			d *= c;
-			e = i;
 		}
-		return true;
+		return false;
 	}
 	bool operator==(const SquareRoot& rhs) const
 	{
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index c2eef7d8..d9a013fd 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -29,7 +29,7 @@
 
 namespace mcl {
 
-static const int version = 0x210; /* 0xABC = A.BC */
+static const int version = 0x211; /* 0xABC = A.BC */
 
 /*
 	specifies available string format mode for X::setIoMode()
diff --git a/include/mcl/vint.hpp b/include/mcl/vint.hpp
index e26e0b3a..347aaf01 100644
--- a/include/mcl/vint.hpp
+++ b/include/mcl/vint.hpp
@@ -696,6 +696,9 @@ class Vint {
 	// logical left shift (copy sign)
 	static void shl(Vint& y, const Vint& x, size_t shiftBit)
 	{
+if (shiftBit > MCL_MAX_BIT_SIZE*2) {
+	printf("shiftBit=%zd\n", shiftBit);
+}
 		assert(shiftBit <= MCL_MAX_BIT_SIZE * 2); // many be too big
 		size_t xn = x.size();
 		size_t yn = xn + (shiftBit + UnitBitSize - 1) / UnitBitSize;
diff --git a/src/bint_impl.hpp b/src/bint_impl.hpp
index 10c07100..5f270455 100644
--- a/src/bint_impl.hpp
+++ b/src/bint_impl.hpp
@@ -46,23 +46,11 @@ struct UnrollMulLowT<1, 1> {
 } // impl
 
 #if MCL_BINT_ASM != 1
-#ifdef MCL_WASM32
-inline uint64_t load8byte(const uint32_t *x)
-{
-	return x[0] | (uint64_t(x[1]) << 32);
-}
-inline void store8byte(uint32_t *x, uint64_t v)
-{
-	x[0] = uint32_t(v);
-	x[1] = uint32_t(v >> 32);
-}
-#endif
 template<size_t N>
 Unit addT(Unit *z, const Unit *x, const Unit *y)
 {
-#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
+#ifdef MCL_WASM32
 	// wasm32 supports 64-bit add
-#if 1
 	uint64_t c = 0;
 	for (size_t i = 0; i < N; i++) {
 		uint64_t v = uint64_t(x[i]) + y[i] + c;
@@ -70,26 +58,6 @@ Unit addT(Unit *z, const Unit *x, const Unit *y)
 		c = v >> 32;
 	}
 	return uint32_t(c);
-#else
-	uint32_t c = 0;
-	for (size_t i = 0; i < N - 1; i += 2) {
-		uint64_t xc = load8byte(x + i) + c;
-		c = xc < c;
-		uint64_t yi = load8byte(y + i);
-		xc += yi;
-		c += xc < yi;
-		store8byte(z + i, xc);
-	}
-	if (N & 1) {
-		uint32_t xc = x[N - 1] + c;
-		c = xc < c;
-		uint32_t yi = y[N - 1];
-		xc += yi;
-		c += xc < yi;
-		z[N - 1] = xc;
-	}
-	return c;
-#endif
 #else
 	Unit c = 0;
 	for (size_t i = 0; i < N; i++) {
@@ -107,9 +75,8 @@ Unit addT(Unit *z, const Unit *x, const Unit *y)
 template<size_t N, typename T>
 Unit subT(Unit *z, const T *x, const Unit *y)
 {
-#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
+#ifdef MCL_WASM32
 	// wasm32 supports 64-bit sub
-#if 1
 	uint64_t c = 0;
 	for (size_t i = 0; i < N; i++) {
 		uint64_t v = uint64_t(x[i]) - y[i] - c;
@@ -117,26 +84,6 @@ Unit subT(Unit *z, const T *x, const Unit *y)
 		c = v >> 63;
 	}
 	return c;
-#else
-	uint32_t c = 0;
-	for (size_t i = 0; i < N - 1; i += 2) {
-		uint64_t yi = load8byte(y + i);
-		yi += c;
-		c = yi < c;
-		uint64_t xi = load8byte(x + i);
-		c += xi < yi;
-		store8byte(z + i, xi - yi);
-	}
-	if (N & 1) {
-		uint32_t yi = y[N - 1];
-		yi += c;
-		c = yi < c;
-		uint32_t xi = x[N - 1];
-		c += xi < yi;
-		z[N - 1] = xi - yi;
-	}
-	return c;
-#endif
 #else
 	Unit c = 0;
 	for (size_t i = 0; i < N; i++) {
@@ -203,7 +150,7 @@ Unit mulUnitT(T *z, const Unit *x, Unit y)
 template<size_t N, typename T>
 Unit mulUnitAddT(T *z, const Unit *x, Unit y)
 {
-#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
+#ifdef MCL_WASM32
 	uint64_t y_ = y;
 	uint64_t v = z[0] + x[0] * y_;
 	z[0] = uint32_t(v);
diff --git a/src/low_func.hpp b/src/low_func.hpp
index 6cd44081..067a8d3c 100644
--- a/src/low_func.hpp
+++ b/src/low_func.hpp
@@ -9,6 +9,10 @@
 #include <mcl/op.hpp>
 #include <mcl/util.hpp>
 #include <cybozu/bit_operation.hpp>
+//#ifdef MCL_WASM32
+// compiler option : -msimd128
+//	#include <wasm_simd128.h>
+//#endif
 
 #ifdef _MSC_VER
 	#pragma warning(push)
@@ -147,7 +151,7 @@ template<size_t N>
 static void modRedT(Unit *z, const Unit *xy, const Unit *p)
 {
 	const Unit rp = p[-1];
-#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
+#ifdef MCL_WASM32
 	uint64_t buf[N * 2];
 #else
 	Unit buf[N * 2];
@@ -232,6 +236,155 @@ static void mulMontT(Unit *z, const Unit *x, const Unit *y, const Unit *p)
 	}
 }
 
+#if 0 // #ifdef MCL_WASM32
+template<size_t N>
+uint64_t mulUnitLazyT(uint64_t *z, const Unit *x, Unit y)
+{
+#if 1
+	uint64_t y_ = y;
+	uint64_t H = x[0] * y_;
+	z[0] = uint32_t(H);
+	uint64_t Ls[N-1], Hs[N-1];
+	Hs[0] = H >> 32;
+	for (size_t i = 1; i < N - 1; i++) {
+		H = x[i] * y_;
+		Ls[i-1] = uint32_t(H);
+		Hs[i] = H >> 32;
+	}
+	H = x[N-1] * y_;
+	Ls[N-2] = uint32_t(H);
+#if 1
+	for (size_t i = 0; i < ((N - 1) & ~1); i += 2) {
+//		z[i+1] = Hs[i] + Ls[i];
+		v128_t vH = wasm_v128_load(&Hs[i]);
+		v128_t vL = wasm_v128_load(&Ls[i]);
+		wasm_v128_store(&z[i+1], wasm_i64x2_add(vH, vL));
+	}
+	if ((N & 1) == 0) {
+		z[N - 1] = Hs[N - 2] + Ls[N - 2];
+	}
+#else
+	for (size_t i = 0; i < N - 1; i++) {
+		z[i+1] = Hs[i] + Ls[i];
+	}
+#endif
+	return H >> 32;
+#else
+	uint64_t y_ = y;
+	uint64_t H = x[0] * y_;
+	z[0] = uint32_t(H);
+	for (size_t i = 1; i < N; i++) {
+		uint64_t v = x[i] * y_;
+		z[i] = uint32_t(v) + (H >> 32);
+		H = v;
+	}
+	return H >> 32;
+#endif
+}
+
+template<size_t N>
+uint64_t mulUnitAddLazyT(uint64_t *z, const Unit *x, Unit y)
+{
+#if 1
+	uint64_t y_ = y;
+	uint64_t H = x[0] * y_ + z[0];
+	z[0] = uint32_t(H);
+	uint64_t Ls[N-1], Hs[N-1];
+	Hs[0] = H >> 32;
+	for (size_t i = 1; i < N - 1; i++) {
+		H = x[i] * y_;
+		Ls[i-1] = uint32_t(H);
+		Hs[i] = H >> 32;
+	}
+	H = x[N-1] * y_;
+	Ls[N-2] = uint32_t(H);
+#if 1
+	for (size_t i = 0; i < ((N - 1) & ~1); i += 2) {
+		v128_t t = wasm_v128_load(&z[i+1]);
+		t = wasm_i64x2_add(t, wasm_v128_load(&Hs[i]));
+		t = wasm_i64x2_add(t, wasm_v128_load(&Ls[i]));
+		wasm_v128_store(&z[i+1], t);
+	}
+	if ((N & 1) == 0) {
+		z[N - 1] += Hs[N - 2] + Ls[N - 2];
+	}
+#else
+	for (size_t i = 0; i < N - 1; i++) {
+		z[i+1] += Hs[i] + Ls[i];
+	}
+#endif
+	return H >> 32;
+#else
+	uint64_t y_ = y;
+	uint64_t H = z[0] + x[0] * y_;
+	z[0] = uint32_t(H);
+	for (size_t i = 1; i < N; i++) {
+		uint64_t v = x[i] * y_;
+		z[i] = z[i] + uint32_t(v) + (H >> 32);
+		H = v;
+	}
+	return H >> 32;
+#endif
+}
+#endif
+
+template<size_t N>
+uint64_t mulUnitLazyT(uint64_t *z, const Unit *x, Unit y)
+{
+	uint64_t y_ = y;
+	uint64_t H = x[0] * y_;
+	z[0] = uint32_t(H);
+	for (size_t i = 1; i < N; i++) {
+		uint64_t v = x[i] * y_;
+		z[i] = uint32_t(v) + (H >> 32);
+		H = v;
+	}
+	return H >> 32;
+}
+
+template<size_t N>
+uint64_t mulUnitAddLazyT(uint64_t *z, const Unit *x, Unit y)
+{
+	uint64_t y_ = y;
+	uint64_t H = z[0] + x[0] * y_;
+	z[0] = uint32_t(H);
+	for (size_t i = 1; i < N; i++) {
+		uint64_t v = x[i] * y_;
+		z[i] = z[i] + uint32_t(v) + (H >> 32);
+		H = v;
+	}
+	return H >> 32;
+}
+
+/*
+	r = bint::mulUnitAddT<N>(z, x, *y);
+	q = z[0] * rp;
+	r += bint::mulUnitAddT<N>(z, p, q);
+	return r;
+*/
+template<size_t N>
+uint64_t mulUnitAddLazy2T(uint64_t *z, const Unit *x, Unit y, const Unit *p, Unit rp)
+{
+	uint64_t y_ = y;
+	uint64_t H1 = z[0] + x[0] * y_;
+	uint64_t t = uint32_t(H1);
+	uint64_t q = uint32_t(t * rp);
+	uint64_t H2 = t + p[0] * q;
+	assert(uint32_t(H2) == 0);
+//	z[0] = uint32_t(H2); // set is unnecessary because it must be zero
+	for (size_t i = 1; i < N; i++) {
+		t = z[i];
+		t += H1 >> 32;
+		H1 = x[i] * y_;
+		t += uint32_t(H1);
+		t += H2 >> 32;
+		H2 = p[i] * q;
+		t += uint32_t(H2);
+		z[i] = t;
+	}
+	return (H1 >> 32) + (H2 >> 32);
+}
+
 template<size_t N>
 static void mulMontNFT(Unit *z, const Unit *x, const Unit *y, const Unit *p)
 {
@@ -247,7 +400,32 @@ static void mulMontNFT(Unit *z, const Unit *x, const Unit *y, const Unit *p)
 		t >> 64 <= (F - 2)(R - 1)/R = (F - 2) - (F - 2)/R
 			t + (t >> 64) = (F - 2)R - (F - 2)/R < FR
 	*/
-#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
+#if 0 // #ifdef MCL_WASM32
+	// use uint64_t if Unit = uint32_t to reduce conversion
+	uint64_t buf[N * 2];
+	buf[N] = mulUnitLazyT<N>(buf, x, y[0]);
+	Unit q = buf[0] * rp;
+	buf[N] += mulUnitAddLazyT<N>(buf, p, q);
+	for (size_t i = 1; i < N; i++) {
+#if 1
+		buf[N + i] = mulUnitAddLazy2T<N>(buf + i, x, y[i], p, rp);
+#else
+		buf[N + i] = mulUnitAddLazyT<N>(buf + i, x, y[i]);
+		q = buf[i] * rp;
+		buf[N + i] += mulUnitAddLazyT<N>(buf + i, p, q);
+#endif
+	}
+	uint64_t H = 0;
+	for (size_t i = N; i < N*2; i++) {
+		uint64_t v = buf[i] + (H >> 32);
+		buf[i] = uint32_t(v);
+		H = v;
+	}
+	if (bint::subT<N>(z, buf + N, p)) {
+		bint::copyT<N>(z, buf + N);
+	}
+#else
+#ifdef MCL_WASM32
 	// use uint64_t if Unit = uint32_t to reduce conversion
 	uint64_t buf[N * 2];
 #else
@@ -264,6 +442,7 @@ static void mulMontNFT(Unit *z, const Unit *x, const Unit *y, const Unit *p)
 	if (bint::subT<N>(z, buf + N, p)) {
 		bint::copyT<N>(z, buf + N);
 	}
+#endif
 }
 
 // z[N] <- Montgomery(x[N], x[N], p[N])
diff --git a/test/bench.hpp b/test/bench.hpp
index 6b3193a4..95f9b06a 100644
--- a/test/bench.hpp
+++ b/test/bench.hpp
@@ -88,6 +88,20 @@ void invVecBench(const char *msg)
 	CYBOZU_BENCH_C(msg, C, mcl::invVec, x, x, n);
 }
 
+template<class T>
+void sqrBench(const T& x, const char *msg)
+{
+	T y = x * x, z;
+	CYBOZU_BENCH_C((std::string(msg) + "::squareRoot T").c_str(), 10000, T::squareRoot, z, y);
+	for (int i = 0; i < 100; i++) {
+		if (!T::squareRoot(z, y)) {
+			break;
+		}
+		y += T::one();
+	}
+	CYBOZU_BENCH_C((std::string(msg) + "::squareRoot F").c_str(), 10000, T::squareRoot, z, y);
+}
+
 void testBench(const G1& P, const G2& Q)
 {
 #ifndef NDEBUG
@@ -151,6 +165,7 @@ void testBench(const G1& P, const G2& Q)
 	CYBOZU_BENCH_C("Fp::sqr       ", C3, Fp::sqr, x, x);
 	CYBOZU_BENCH_C("Fp::inv       ", C3, invAdd, x, x, y);
 	CYBOZU_BENCH_C("Fp::pow       ", C3, Fp::pow, x, x, y);
+	sqrBench(x, "Fp");
 	invVecBench<Fp>("Fp:invVec");
 	invVecBench<Fr>("Fr:invVec");
 	{
@@ -168,6 +183,7 @@ void testBench(const G1& P, const G2& Q)
 		CYBOZU_BENCH_C("Fr::sqr       ", C3, Fr::sqr, a, a);
 		CYBOZU_BENCH_C("Fr::inv       ", C3, invAdd, a, a, b);
 		CYBOZU_BENCH_C("Fr::pow       ", C3, Fr::pow, a, a, b);
+		sqrBench(a, "Fr");
 	}
 	Fp2 xx, yy;
 	xx.a = x;
diff --git a/test/ec_test.cpp b/test/ec_test.cpp
index dd11ba61..39fccb66 100644
--- a/test/ec_test.cpp
+++ b/test/ec_test.cpp
@@ -385,19 +385,37 @@ struct Test {
 	}
 	void squareRoot() const
 	{
+		Ec P(Fp(para.gx), Fp(para.gy));
+
+		for (int i = 0; i < 100; i++) {
+			Ec::dbl(P, P);
+			P.normalize();
+			Fp x = P.x;
+			Fp y = P.y;
+			Fp yy;
+			CYBOZU_TEST_ASSERT(Ec::getYfromX(yy, x, y.isOdd()));
+			CYBOZU_TEST_EQUAL(yy, y);
+			Fp::neg(y, y);
+			yy.clear();
+			CYBOZU_TEST_ASSERT(Ec::getYfromX(yy, x, y.isOdd()));
+			CYBOZU_TEST_EQUAL(yy, y);
+			yy += P.y;
+			CYBOZU_TEST_ASSERT(yy.isZero());
+		}
+
 		Fp x(para.gx);
-		Fp y(para.gy);
-		bool odd = y.isOdd();
-		Fp yy;
-		bool b = Ec::getYfromX(yy, x, odd);
-		CYBOZU_TEST_ASSERT(b);
-		CYBOZU_TEST_EQUAL(yy, y);
-		Fp::neg(y, y);
-		odd = y.isOdd();
-		yy.clear();
-		b = Ec::getYfromX(yy, x, odd);
-		CYBOZU_TEST_ASSERT(b);
-		CYBOZU_TEST_EQUAL(yy, y);
+		for (int i = 0; i < 100; i++) {
+			mpz_class mx = x.getMpz();
+			int ret = mcl::gmp::legendre(mx, Fp::getOp().mp);
+			Fp y;
+			if (Fp::squareRoot(y, x)) {
+				CYBOZU_TEST_EQUAL(y*y, x);
+				CYBOZU_TEST_EQUAL(ret, 1);
+			} else {
+				CYBOZU_TEST_EQUAL(ret, -1);
+			}
+			x += 1;
+		}
 	}
 	void mul_fp() const
 	{
diff --git a/test/sq_test.cpp b/test/sq_test.cpp
index 4c386d23..0ab24c77 100644
--- a/test/sq_test.cpp
+++ b/test/sq_test.cpp
@@ -11,10 +11,15 @@ CYBOZU_TEST_AUTO(sqrt)
 		sq.set(p);
 		for (mpz_class a = 0; a < p; a++) {
 			mpz_class x;
-			if (sq.get(x, a)) {
+			bool b1 = sq.get(x, a);
+			mpz_class x2;
+			bool b2 = sq.get2(x2, a);
+			CYBOZU_TEST_EQUAL(b1, b2);
+			if (b1) {
 				mpz_class y;
 				y = (x * x) % p;
 				CYBOZU_TEST_EQUAL(a, y);
+				CYBOZU_TEST_EQUAL(x, x2);
 			}
 		}
 	}