Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster pcurves reductions for P-256 and P-384 #4147

Merged
merged 2 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/lib/math/pcurves/info.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pcurves.h
pcurves_id.h
pcurves_impl.h
pcurves_util.h
pcurves_solinas.h
pcurves_wrap.h
pcurves_instance.h
</header:internal>
114 changes: 110 additions & 4 deletions src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,115 @@

#include <botan/internal/pcurves_instance.h>

#include <botan/internal/pcurves_solinas.h>
#include <botan/internal/pcurves_wrap.h>

namespace Botan::PCurve {

namespace {

// clang-format off
template <typename Params>
class Secp256r1Rep final {
public:
static constexpr auto P = Params::P;
static constexpr size_t N = Params::N;
typedef typename Params::W W;

// Adds 4 * P-256 to prevent underflow
static constexpr auto P256_4 =
hex_to_words<uint32_t>("0x3fffffffc00000004000000000000000000000003fffffffffffffffffffffffc");
Comment on lines +24 to +25
Copy link
Collaborator

@reneme reneme Jun 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you decide to adopt my suggestion, it would be worthwhile to replace this with a constexpr multiplication (e.g. bigmul(4, Params::P)) to avoid a magic string/number, if such a helper already exists somewhere.


// Reduce the double-width value z (2*N words) to N words using the Solinas
// structure of P-256.
//
// z is split into sixteen 32-bit limbs X00..X15 (X00 least significant). The
// limbs are combined column by column into the signed sums S0..S7, the sums
// are carried into the result r by SolinasAccum, and the remaining carry S
// selects a small multiple of P that is subtracted to finish the reduction.
//
// NOTE(review): presumably z is the product of two reduced field elements;
// the bound asserted on S below depends on the range of z - confirm against
// the callers.
constexpr static std::array<W, N> redc(const std::array<W, 2 * N>& z) {
// Each limb is widened to int64_t so that the column sums below can exceed
// 32 bits or go negative without wrapping.
const int64_t X00 = get_uint32(z.data(), 0);
const int64_t X01 = get_uint32(z.data(), 1);
const int64_t X02 = get_uint32(z.data(), 2);
const int64_t X03 = get_uint32(z.data(), 3);
const int64_t X04 = get_uint32(z.data(), 4);
const int64_t X05 = get_uint32(z.data(), 5);
const int64_t X06 = get_uint32(z.data(), 6);
const int64_t X07 = get_uint32(z.data(), 7);
const int64_t X08 = get_uint32(z.data(), 8);
const int64_t X09 = get_uint32(z.data(), 9);
const int64_t X10 = get_uint32(z.data(), 10);
const int64_t X11 = get_uint32(z.data(), 11);
const int64_t X12 = get_uint32(z.data(), 12);
const int64_t X13 = get_uint32(z.data(), 13);
const int64_t X14 = get_uint32(z.data(), 14);
const int64_t X15 = get_uint32(z.data(), 15);

// See SP 800-186 section G.1.2
//
// Column k collects every limb that the reduction formula adds to or
// subtracts from 32-bit position k, plus limb k of the P256_4 constant.
const int64_t S0 = P256_4[0] + X00 + X08 + X09 - (X11 + X12 + X13 + X14);
const int64_t S1 = P256_4[1] + X01 + X09 + X10 - (X12 + X13 + X14 + X15);
const int64_t S2 = P256_4[2] + X02 + X10 + X11 - (X13 + X14 + X15);
const int64_t S3 = P256_4[3] + X03 + 2 * (X11 + X12) + X13 - (X15 + X08 + X09);
const int64_t S4 = P256_4[4] + X04 + 2 * (X12 + X13) + X14 - (X09 + X10);
const int64_t S5 = P256_4[5] + X05 + 2 * (X13 + X14) + X15 - (X10 + X11);
const int64_t S6 = P256_4[6] + X06 + X13 + X14 * 3 + X15 * 2 - (X08 + X09);
const int64_t S7 = P256_4[7] + X07 + X15 * 3 + X08 - (X10 + X11 + X12 + X13);
// The ninth limb of P256_4 has no input-limb contribution; it is folded in
// by final_carry below.
const int64_t S8 = P256_4[8];

// r must start out zeroed: with 64-bit words SolinasAccum ORs each 32-bit
// limb into its word rather than overwriting it.
std::array<W, N> r = {};

SolinasAccum sum(r);

// Feed the columns in order from least to most significant; each call
// stores the low 32 bits of the running sum into r and keeps the rest as
// the carry into the next column.
sum.accum(S0);
sum.accum(S1);
sum.accum(S2);
sum.accum(S3);
sum.accum(S4);
sum.accum(S5);
sum.accum(S6);
sum.accum(S7);
// S is whatever remains above the eight stored limbs after adding S8
const auto S = sum.final_carry(S8);

BOTAN_DEBUG_ASSERT(S <= 8);

// p256_mul_mod_256 starts from P itself, so the value subtracted here is
// ((S + 1) * P) mod 2**256.
// NOTE(review): bigint_sub2 presumably returns the borrow out of the
// subtraction - confirm in mp_core.h.
const auto correction = p256_mul_mod_256(S);
W borrow = bigint_sub2(r.data(), N, correction.data(), N);

// NOTE(review): bigint_cnd_add presumably adds P only when borrow is set,
// undoing one subtraction of P if the step above went negative - confirm
// in mp_core.h.
bigint_cnd_add(borrow, r.data(), N, P.data(), N);

return r;
}

// The value one in this representation: word 0 is set to 1 and every other
// word is zero.
constexpr static std::array<W, N> one() {
   return {1};
}

// Convert an integer into this representation. No transformation is applied:
// the words are returned unchanged.
constexpr static std::array<W, N> to_rep(const std::array<W, N>& x) {
   return x;
}

// Convert a double-width (2*N word) integer into this representation by
// reducing it with redc.
constexpr static std::array<W, N> wide_to_rep(const std::array<W, 2 * N>& x) {
   return redc(x);
}

// Convert a value out of this representation. No transformation is applied:
// the words are returned unchanged.
constexpr static std::array<W, N> from_rep(const std::array<W, N>& z) {
   return z;
}

private:
// Return ((i + 1) * P-256) % 2**256
//
// The result starts out as a copy of P, and i is then added to or subtracted
// from individual words. Since P-256 = 2**256 - 2**224 + 2**192 + 2**96 - 1,
// those adjustments add i * (P - 2**256), giving (i + 1) * P modulo 2**256.
// In particular i == 0 yields P itself, not zero.
//
// Assumes i is small: nothing is carried or borrowed between words, so i
// must not be large enough to make any single word wrap around.
constexpr static std::array<W, N> p256_mul_mod_256(W i) {
   constexpr auto word_bits = WordInfo<W>::bits;
   static_assert(word_bits == 32 || word_bits == 64);

   // For small i, multiples of P-256 have a simple structure, so computing
   // the value directly is faster than a (constant time) table lookup

   auto multiple = P;
   if constexpr(word_bits == 64) {
      // Each 64-bit word holds two 32-bit limbs; shifting i by 32 targets the
      // upper limb of a word.
      const uint64_t i_hi = static_cast<uint64_t>(i) << 32;
      multiple[0] -= i;     // - i
      multiple[1] += i_hi;  // + i * 2**96
      multiple[3] += i;     // + i * 2**192
      multiple[3] -= i_hi;  // - i * 2**224
   } else {
      multiple[0] -= i;  // - i
      multiple[3] += i;  // + i * 2**96
      multiple[6] += i;  // + i * 2**192
      multiple[7] -= i;  // - i * 2**224
   }
   return multiple;
}
};

namespace secp256r1 {

// clang-format off
class Params final : public EllipticCurveParameters<
"FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFF",
"FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFC",
Expand All @@ -25,11 +125,17 @@ class Params final : public EllipticCurveParameters<
-10> {
};

class Curve final : public EllipticCurve<Params> {};
// clang-format on

}
#if BOTAN_MP_WORD_BITS == 32
// Secp256r1Rep also works with 64-bit words, but there it is at best marginally
// faster than the default representation, at least on the compilers/CPUs tested so far
class Curve final : public EllipticCurve<Params, Secp256r1Rep> {};
#else
class Curve final : public EllipticCurve<Params> {};
#endif

// clang-format on
} // namespace secp256r1

} // namespace

Expand Down
113 changes: 112 additions & 1 deletion src/lib/math/pcurves/pcurves_secp384r1/pcurves_secp384r1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,123 @@

#include <botan/internal/pcurves_instance.h>

#include <botan/internal/pcurves_solinas.h>
#include <botan/internal/pcurves_wrap.h>

namespace Botan::PCurve {

namespace {

// Field element representation for P-384 based on Solinas reduction.
//
// Values are stored as plain integers (to_rep and from_rep return their
// argument unchanged); double-width values are reduced by redc, which works
// on the 32-bit limbs of its input.
template <typename Params>
class Secp384r1Rep final {
public:
// Modulus, word count and word type are all taken from the curve parameters
static constexpr auto P = Params::P;
static constexpr size_t N = Params::N;
typedef typename Params::W W;

// Reduce the double-width value z (2*N words) to N words.
//
// z is split into twenty-four 32-bit limbs X00..X23 (X00 least significant).
// The limbs are combined column by column into the signed sums S0..SB, the
// sums are carried into the result r by SolinasAccum, and the remaining
// carry S selects a small multiple of P that is subtracted to finish.
//
// NOTE(review): presumably z is the product of two reduced field elements;
// the bound asserted on S below depends on the range of z - confirm against
// the callers.
constexpr static std::array<W, N> redc(const std::array<W, 2 * N>& z) {
// Each limb is widened to int64_t so that the column sums below can exceed
// 32 bits or go negative without wrapping.
const int64_t X00 = get_uint32(z.data(), 0);
const int64_t X01 = get_uint32(z.data(), 1);
const int64_t X02 = get_uint32(z.data(), 2);
const int64_t X03 = get_uint32(z.data(), 3);
const int64_t X04 = get_uint32(z.data(), 4);
const int64_t X05 = get_uint32(z.data(), 5);
const int64_t X06 = get_uint32(z.data(), 6);
const int64_t X07 = get_uint32(z.data(), 7);
const int64_t X08 = get_uint32(z.data(), 8);
const int64_t X09 = get_uint32(z.data(), 9);
const int64_t X10 = get_uint32(z.data(), 10);
const int64_t X11 = get_uint32(z.data(), 11);
const int64_t X12 = get_uint32(z.data(), 12);
const int64_t X13 = get_uint32(z.data(), 13);
const int64_t X14 = get_uint32(z.data(), 14);
const int64_t X15 = get_uint32(z.data(), 15);
const int64_t X16 = get_uint32(z.data(), 16);
const int64_t X17 = get_uint32(z.data(), 17);
const int64_t X18 = get_uint32(z.data(), 18);
const int64_t X19 = get_uint32(z.data(), 19);
const int64_t X20 = get_uint32(z.data(), 20);
const int64_t X21 = get_uint32(z.data(), 21);
const int64_t X22 = get_uint32(z.data(), 22);
const int64_t X23 = get_uint32(z.data(), 23);

// One copy of P-384 is added to prevent underflow
//
// The leading constant on each line is the 32-bit limb of that added copy
// for the column; the remaining terms are the input limbs that are added
// to or subtracted from that 32-bit position.
const int64_t S0 = 0xFFFFFFFF + X00 + X12 + X20 + X21 - X23;
const int64_t S1 = 0x00000000 + X01 + X13 + X22 + X23 - X12 - X20;
const int64_t S2 = 0x00000000 + X02 + X14 + X23 - X13 - X21;
const int64_t S3 = 0xFFFFFFFF + X03 + X12 + X15 + X20 + X21 - X14 - X22 - X23;
const int64_t S4 = 0xFFFFFFFE + X04 + X12 + X13 + X16 + X20 + X21 * 2 + X22 - X15 - X23 * 2;
const int64_t S5 = 0xFFFFFFFF + X05 + X13 + X14 + X17 + X21 + X22 * 2 + X23 - X16;
const int64_t S6 = 0xFFFFFFFF + X06 + X14 + X15 + X18 + X22 + X23 * 2 - X17;
const int64_t S7 = 0xFFFFFFFF + X07 + X15 + X16 + X19 + X23 - X18;
const int64_t S8 = 0xFFFFFFFF + X08 + X16 + X17 + X20 - X19;
const int64_t S9 = 0xFFFFFFFF + X09 + X17 + X18 + X21 - X20;
const int64_t SA = 0xFFFFFFFF + X10 + X18 + X19 + X22 - X21;
const int64_t SB = 0xFFFFFFFF + X11 + X19 + X20 + X23 - X22;

// r must start out zeroed: with 64-bit words SolinasAccum ORs each 32-bit
// limb into its word rather than overwriting it.
std::array<W, N> r = {};

SolinasAccum sum(r);

// Feed the columns in order from least to most significant; each call
// stores the low 32 bits of the running sum into r and keeps the rest as
// the carry into the next column.
sum.accum(S0);
sum.accum(S1);
sum.accum(S2);
sum.accum(S3);
sum.accum(S4);
sum.accum(S5);
sum.accum(S6);
sum.accum(S7);
sum.accum(S8);
sum.accum(S9);
sum.accum(SA);
sum.accum(SB);
// No extra top limb is added here; S is the carry left over after the
// twelve stored limbs.
const auto S = sum.final_carry(0);

BOTAN_DEBUG_ASSERT(S <= 4);

// p384_mul_mod_384 starts from P itself, so the value subtracted here is
// ((S + 1) * P) mod 2**384.
// NOTE(review): bigint_sub2 presumably returns the borrow out of the
// subtraction - confirm in mp_core.h.
const auto correction = p384_mul_mod_384(S);
W borrow = bigint_sub2(r.data(), N, correction.data(), N);

// NOTE(review): bigint_cnd_add presumably adds P only when borrow is set,
// undoing one subtraction of P if the step above went negative - confirm
// in mp_core.h.
bigint_cnd_add(borrow, r.data(), N, P.data(), N);

return r;
}

// The value one: word 0 is set to 1 and every other word is zero
constexpr static std::array<W, N> one() { return std::array<W, N>{1}; }

// Conversion into the representation leaves the words unchanged
constexpr static std::array<W, N> to_rep(const std::array<W, N>& x) { return x; }

// A double-width value enters the representation by being reduced with redc
constexpr static std::array<W, N> wide_to_rep(const std::array<W, 2 * N>& x) { return redc(x); }

// Conversion out of the representation leaves the words unchanged
constexpr static std::array<W, N> from_rep(const std::array<W, N>& z) { return z; }

private:
// Return ((i + 1) * P-384) % 2**384
//
// The result starts out as a copy of P, and i is then added to or subtracted
// from individual words. Since P-384 = 2**384 - 2**128 - 2**96 + 2**32 - 1,
// those adjustments add i * (P - 2**384), giving (i + 1) * P modulo 2**384.
// In particular i == 0 yields P itself, not zero.
//
// Assumes i is small: nothing is carried or borrowed between words, so i
// must not be large enough to make any single word wrap around.
constexpr static std::array<W, N> p384_mul_mod_384(W i) {
static_assert(WordInfo<W>::bits == 32 || WordInfo<W>::bits == 64);

// For small i, multiples of P-384 have a simple structure so it's faster to
// compute the value directly vs a (constant time) table lookup

auto r = P;
if constexpr(WordInfo<W>::bits == 32) {
// 32-bit words: one limb per word
r[4] -= i;  // - i * 2**128
r[3] -= i;  // - i * 2**96
r[1] += i;  // + i * 2**32
r[0] -= i;  // - i
} else {
// 64-bit words: two limbs per word, i32 targets the upper limb of a word
const uint64_t i32 = static_cast<uint64_t>(i) << 32;
r[2] -= i;    // - i * 2**128
r[1] -= i32;  // - i * 2**96
r[0] += i32;  // + i * 2**32
r[0] -= i;    // - i
}
return r;
}
};

// clang-format off
namespace secp384r1 {

Expand All @@ -25,7 +136,7 @@ class Params final : public EllipticCurveParameters<
-12> {
};

class Curve final : public EllipticCurve<Params> {};
class Curve final : public EllipticCurve<Params, Secp384r1Rep> {};

}

Expand Down
83 changes: 83 additions & 0 deletions src/lib/math/pcurves/pcurves_solinas.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
* (C) 2024 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#ifndef BOTAN_PCURVES_SOLINAS_REDC_HELPER_H_
#define BOTAN_PCURVES_SOLINAS_REDC_HELPER_H_

#include <botan/internal/mp_core.h>

namespace Botan {

/*
Helpers for reduction modulo Solinas primes, such as the P-256 and P-384 field primes.

Instead of explicitly forming the various integers and adding/subtracting them
row-by-row, we compute the entire sum in one pass, column by column. To prevent
overflow/underflow the accumulator is a signed 64-bit integer, while the various
limbs are (at least for all NIST curves aside from P-192) 32 bit integers.

For more background on Solinas primes / Solinas reduction see

* J. Solinas 'Generalized Mersenne Numbers'
<https://cacr.uwaterloo.ca/techreports/1999/corr99-39.pdf>
* NIST SP 800-186 Appendix G.1
<https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-186.pdf>
* Handbook of Elliptic And Hyperelliptic Curve Cryptography § 10.4.3

*/

// Read the i-th 32-bit limb of the word array xw.
//
// With 32-bit words the limb is simply xw[i]. With 64-bit words each word
// holds two limbs: an even i selects the low half and an odd i the high half
// of xw[i / 2].
template <WordType W>
constexpr uint32_t get_uint32(const W xw[], size_t i) {
   constexpr auto word_bits = WordInfo<W>::bits;
   static_assert(word_bits == 32 || word_bits == 64);

   if constexpr(word_bits == 64) {
      const size_t word_idx = i / 2;
      const size_t shift = (i % 2) * 32;
      return static_cast<uint32_t>(xw[word_idx] >> shift);
   } else {
      return xw[i];
   }
}

/*
Column-by-column accumulator used by the Solinas reductions.

The caller computes one signed 64-bit sum per 32-bit output column and passes
them to accum() in order from least to most significant. Each call stores the
low 32 bits of the running total as the next limb of the output array and
keeps the remainder as the carry into the following column. After all N32
columns have been supplied, final_carry() returns what is left over.

The accumulator only holds a reference to the output array, which must
outlive it. With 64-bit words limbs are OR-ed into place, so the array must
be zero on entry.

All members are constexpr so that constexpr reduction functions built on top
of this class can actually be constant-evaluated.
*/
template <WordType W, size_t N>
class SolinasAccum {
   public:
      static_assert(WordInfo<W>::bits == 32 || WordInfo<W>::bits == 64);

      // Number of 32-bit limbs in an N-word value
      static constexpr size_t N32 = N * (WordInfo<W>::bits / 32);

      // Bind the accumulator to the output array r; nothing is written yet.
      // Explicit, since converting an array into an accumulator implicitly
      // is never intended.
      constexpr explicit SolinasAccum(std::array<W, N>& r) : m_r(r), m_S(0), m_idx(0) {}

      // Add the column sum v to the running total, emit the low 32 bits as
      // the next output limb and keep the rest as carry.
      constexpr void accum(int64_t v) {
         BOTAN_DEBUG_ASSERT(m_idx < N32);

         m_S += v;
         const uint32_t r = static_cast<uint32_t>(m_S);
         // Shift of a signed value: a negative running total leaves a
         // negative carry for the next column
         m_S >>= 32;

         if constexpr(WordInfo<W>::bits == 32) {
            m_r[m_idx] = r;
         } else {
            // Two limbs per word: even limbs land in the low half, odd limbs
            // in the high half. OR-ing relies on the word starting as zero.
            m_r[m_idx / 2] |= static_cast<uint64_t>(r) << (32 * (m_idx % 2));
         }

         m_idx += 1;
      }

      // Add C to the carry left after the last column and return the total.
      // Must be called only after all N32 columns have been accumulated; the
      // total is expected to be non-negative.
      constexpr W final_carry(int64_t C) {
         BOTAN_DEBUG_ASSERT(m_idx == N32);
         m_S += C;
         BOTAN_DEBUG_ASSERT(m_S >= 0);
         return static_cast<W>(m_S);
      }

   private:
      std::array<W, N>& m_r;  // output limbs, written by accum()
      int64_t m_S;            // running sum / carry between columns
      size_t m_idx;           // index of the next 32-bit limb to write
};

} // namespace Botan

#endif
Loading