Skip to content

Commit

Permalink
mirage-crypto-ec: 25519 use big pre-computed tables
Browse files Browse the repository at this point in the history
taken from boringssl (with OPENSSL_SMALL not defined)

performance for sign is improved by a factor of around 2
  • Loading branch information
hannesm committed Feb 18, 2024
1 parent 4e8790d commit 3aaf6dd
Show file tree
Hide file tree
Showing 2 changed files with 3,011 additions and 163 deletions.
203 changes: 129 additions & 74 deletions ec/native/curve25519_stubs.c
Original file line number Diff line number Diff line change
Expand Up @@ -177,21 +177,12 @@ static void fe_0(fe *h) {
memset(h, 0, sizeof(fe));
}

// h = 0
// Clears every byte of the loose field element |h|.
static void fe_loose_0(fe_loose *h) {
  memset(h, 0, sizeof *h);
}

// h = 1
// Zeroes the whole element, then stores 1 in the first entry of |h->v|.
static void fe_1(fe *h) {
  memset(h, 0, sizeof *h);
  h->v[0] = 1;
}

// h = 1
// Loose-element counterpart of fe_1: zero everything, then set |h->v[0]| to 1.
static void fe_loose_1(fe_loose *h) {
  memset(h, 0, sizeof *h);
  h->v[0] = 1;
}

// h = f + g
// Can overlap h with f or g.
static void fe_add(fe_loose *h, const fe *f, const fe *g) {
Expand Down Expand Up @@ -229,10 +220,6 @@ static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
fe_mul_impl(h->v, f->v, g->v);
}

// Forwards to fe_mul_impl with |h->v| as the output and |f->v|, |g->v| as the
// inputs. This variant accepts an fe_loose |f| and an fe |g| and writes an
// fe_loose |h|; the arithmetic itself lives in fe_mul_impl.
static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {
fe_mul_impl(h->v, f->v, g->v);
}

// Forwards to fe_mul_impl with |h->v| as the output and |f->v|, |g->v| as the
// inputs. This variant takes fe operands for both inputs and for the output;
// the arithmetic itself lives in fe_mul_impl.
static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
fe_mul_impl(h->v, f->v, g->v);
}
Expand Down Expand Up @@ -547,12 +534,6 @@ static void ge_p3_0(ge_p3 *h) {
fe_0(&h->T);
}

// Resets |h| so that yplusx = 1, yminusx = 1 and xy2d = 0.
// The three fields are independent, so the order of initialisation is
// irrelevant.
static void ge_precomp_0(ge_precomp *h) {
  fe_loose_0(&h->xy2d);
  fe_loose_1(&h->yminusx);
  fe_loose_1(&h->yplusx);
}

// r = p
static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
fe_copy(&r->X, &p->X);
Expand Down Expand Up @@ -679,78 +660,152 @@ static void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
fe_add(&r->T, &trZ, &trT);
}

// Returns 1 when |b| and |c| hold the same value and 0 otherwise, using only
// arithmetic on the operands (no comparison or branch).
static uint8_t equal(signed char b, signed char c) {
  // XOR of the two bytes: 0 when they match, 1..255 when they differ.
  const uint8_t diff = (uint8_t)((uint8_t)b ^ (uint8_t)c);
  // Widened to 32 bits, diff - 1 wraps to 0xffffffff only for diff == 0;
  // every other input lands in 0..254, which leaves bit 31 clear.
  const uint32_t wrapped = (uint32_t)diff - 1;
  // Bit 31 is therefore the answer.
  return (uint8_t)(wrapped >> 31);
}

// Applies fe_cmov to each of the three fields of a ge_precomp (yplusx,
// yminusx, xy2d), passing the same selector |b| every time.
// NOTE(review): presumably |t| is replaced by |u| when b == 1 and left
// unchanged when b == 0 -- confirm against fe_cmov, which is defined
// elsewhere.
static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
fe_cmov(&t->yplusx, &u->yplusx, b);
fe_cmov(&t->yminusx, &u->yminusx, b);
fe_cmov(&t->xy2d, &u->xy2d, b);
}

static void x25519_ge_scalarmult_small_precomp(
ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
// precomp_table is first expanded into matching |ge_precomp|
// elements.
ge_precomp multiples[15];

unsigned i;
for (i = 0; i < 15; i++) {
// The precomputed table is assumed to already clear the top bit, so
// |fe_frombytes_strict| may be used directly.
const uint8_t *bytes = &precomp_table[i*(2 * 32)];
fe x, y;
fe_frombytes_strict(&x, bytes);
fe_frombytes_strict(&y, bytes + 32);

ge_precomp *out = &multiples[i];
fe_add(&out->yplusx, &y, &x);
fe_sub(&out->yminusx, &y, &x);
fe_mul_ltt(&out->xy2d, &x, &y);
fe_mul_llt(&out->xy2d, &out->xy2d, &d2);
// from crypto/internal.h
// constant_time_msb_w broadcasts the most significant bit of |a| to every bit
// position: the result is all ones when that bit is set and zero otherwise.
static inline WORD constant_time_msb_w(WORD a) {
  // Shift the top bit down to bit 0, leaving 0 or 1.
  const WORD top_bit = a >> (8 * sizeof(a) - 1);
  // 0 - 1 wraps to all ones; 0 - 0 stays zero.
  return 0u - top_bit;
}

// constant_time_is_zero_w returns 0xff..f if a == 0 and 0 otherwise.
//
// Here is an SMT-LIB verification of the formula used below:
//
// (define-fun is_zero ((a (_ BitVec 32))) (_ BitVec 32)
// (bvand (bvnot a) (bvsub a #x00000001))
// )
//
// (declare-fun a () (_ BitVec 32))
//
// (assert (not (= (= #x00000001 (bvlshr (is_zero a) #x0000001f)) (= a #x00000000))))
// (check-sat)
// (get-model)
static inline WORD constant_time_is_zero_w(WORD a) {
  const WORD minus_one = a - 1;
  // The top bit of (~a & (a - 1)) is set exactly when a is zero.
  const WORD probe = ~a & minus_one;
  return constant_time_msb_w(probe);
}

// constant_time_eq_w returns 0xff..f if a == b and 0 otherwise.
static inline WORD constant_time_eq_w(WORD a, WORD b) {
  // The XOR is zero exactly when both words are identical.
  const WORD difference = a ^ b;
  return constant_time_is_zero_w(difference);
}

// value_barrier_w returns |a|, but prevents GCC and Clang from reasoning about
// the returned value. This is used to mitigate compilers undoing constant-time
// code, until we can express our requirements directly in the language.
//
// Note the compiler is aware that |value_barrier_w| has no side effects and
// always has the same output for a given input. This allows it to eliminate
// dead code, move computations across loops, and vectorize.
static inline WORD value_barrier_w(WORD a) {
#if defined(__GNUC__) || defined(__clang__)
// Empty asm statement that lists |a| as a read-write register operand ("+r"),
// so the compiler has to treat |a| as possibly modified at this point. When
// neither macro is defined the function simply returns |a|.
__asm__("" : "+r"(a) : /* no inputs */);
#endif
return a;
}

// constant_time_conditional_memxor xors |n| bytes from |src| into |dst| when
// |mask| is 0xff..ff and leaves |dst| untouched when |mask| is 0. As with
// |memcpy|, the two |n|-byte ranges must not overlap.
static inline void constant_time_conditional_memxor(void *dst, const void *src,
                                                    const size_t n,
                                                    const WORD mask) {
  assert(!buffers_alias(dst, n, src, n));
  uint8_t *out_byte = (uint8_t *)dst;
  const uint8_t *in_byte = (const uint8_t *)src;
  for (size_t left = n; left != 0; left--) {
    // The mask goes through the value barrier on every byte, exactly as in
    // the indexed formulation.
    *out_byte ^= value_barrier_w(mask) & *in_byte;
    out_byte++;
    in_byte++;
  }
}

// See the comment above |k25519SmallPrecomp| about the structure of the
// precomputed elements. This loop does 64 additions and 64 doublings to
// calculate the result.
ge_p3_0(h);
static void table_select(ge_precomp *t, const int pos, const signed char b) {
uint8_t bnegative = constant_time_msb_w(b);
uint8_t babs = b - ((bnegative & b) << 1);

for (i = 63; i < 64; i--) {
unsigned j;
signed char index = 0;
uint8_t t_bytes[3][32] = {
{constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}};
#if defined(__clang__) // materialize for vectorization, 6% speedup
__asm__("" : "+m" (t_bytes) : /*no inputs*/);
#endif
static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), "");
for (int i = 0; i < 8; i++) {
constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i],
sizeof(t_bytes),
constant_time_eq_w(babs, 1 + i));
}

for (j = 0; j < 4; j++) {
const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
index |= (bit << j);
}
fe yplusx, yminusx, xy2d;
fe_frombytes_strict(&yplusx, t_bytes[0]);
fe_frombytes_strict(&yminusx, t_bytes[1]);
fe_frombytes_strict(&xy2d, t_bytes[2]);

ge_precomp e;
ge_precomp_0(&e);
fe_copy_lt(&t->yplusx, &yplusx);
fe_copy_lt(&t->yminusx, &yminusx);
fe_copy_lt(&t->xy2d, &xy2d);

for (j = 1; j < 16; j++) {
cmov(&e, &multiples[j-1], equal(index, j));
}
ge_precomp minust;
fe_copy_lt(&minust.yplusx, &yminusx);
fe_copy_lt(&minust.yminusx, &yplusx);
fe_neg(&minust.xy2d, &xy2d);
cmov(t, &minust, bnegative>>7);
}

ge_cached cached;
ge_p1p1 r;
x25519_ge_p3_to_cached(&cached, h);
x25519_ge_add(&r, h, &cached);
x25519_ge_p1p1_to_p3(h, &r);
// h = a * B
// where a = a[0]+256*a[1]+...+256^31 a[31]
// B is the Ed25519 base point (x,4/5) with x positive.
//
// Preconditions:
// a[31] <= 127
//
// NOTE(review): this text comes from a rendered diff; the lines below may mix
// the new table-based body with lines of the implementation it replaces.
// Confirm against the committed file.
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
signed char e[64];
signed char carry;
ge_p1p1 r;
ge_p2 s;
ge_precomp t;
int i;

// Split every byte of |a| into two 4-bit digits, low nibble first.
for (i = 0; i < 32; ++i) {
e[2 * i + 0] = (a[i] >> 0) & 15;
e[2 * i + 1] = (a[i] >> 4) & 15;
}
// each e[i] is between 0 and 15
// e[63] is between 0 and 7

// Recode the digits into signed form: whenever e[i] + 8 reaches 16, carry 1
// into the next digit and subtract 16 from this one.
carry = 0;
for (i = 0; i < 63; ++i) {
e[i] += carry;
carry = e[i] + 8;
carry >>= 4;
e[i] -= carry << 4;
}
e[63] += carry;
// each e[i] is between -8 and 8

// NOTE(review): |e| is the signed char[64] digit array here and |h| is only
// initialised by ge_p3_0 on the following line, so this call does not fit the
// surrounding code. It looks like residue of the removed implementation --
// confirm and drop if so.
ge_madd(&r, h, &e);
ge_p3_0(h);
// Add the table entry selected by each odd-indexed digit e[1], e[3], ...,
// e[63] into |h|.
for (i = 1; i < 64; i += 2) {
table_select(&t, i / 2, e[i]);
ge_madd(&r, h, &t);
x25519_ge_p1p1_to_p3(h, &r);
}
}

// NOTE(review): as rendered here this definition reuses the name
// x25519_ge_scalarmult_base and refers to r, s, t, e and i without declaring
// them. The header and the x25519_ge_scalarmult_small_precomp call look like
// the previous one-call implementation, and the remaining statements look like
// the continuation of the table-based body -- confirm against the committed
// file before relying on this text.
static void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
// One ge_p3_dbl followed by three ge_p2_dbl calls, converting between
// representations in between; the final result is written back into |h|.
ge_p3_dbl(&r, h);
x25519_ge_p1p1_to_p2(&s, &r);
ge_p2_dbl(&r, &s);
x25519_ge_p1p1_to_p2(&s, &r);
ge_p2_dbl(&r, &s);
x25519_ge_p1p1_to_p2(&s, &r);
ge_p2_dbl(&r, &s);
x25519_ge_p1p1_to_p3(h, &r);

// Add the table entry selected by each even-indexed digit e[0], e[2], ...,
// e[62] into |h|.
for (i = 0; i < 64; i += 2) {
table_select(&t, i / 2, e[i]);
ge_madd(&r, h, &t);
x25519_ge_p1p1_to_p3(h, &r);
}
}

static void slide(signed char *r, const uint8_t *a) {
Expand Down
Loading

0 comments on commit 3aaf6dd

Please sign in to comment.