Merge celo/zexe w/ arkworks #127

Draft: wants to merge 3 commits into base: master
2 changes: 2 additions & 0 deletions Cargo.toml
@@ -14,6 +14,8 @@ members = [
"poly-benches",
"test-curves",
"test-templates",

"scripts/glv-lattice-basis",
]

[profile.release]
16 changes: 16 additions & 0 deletions ec/Cargo.toml
@@ -19,9 +19,25 @@ ark-ff = { path = "../ff", default-features = false }
derivative = { version = "2", features = ["use_core"] }
num-traits = { version = "0.2", default-features = false }
rayon = { version = "1", optional = true }
itertools = { version = "0.9.0", default-features = false }
either = { version = "1.6.0", default-features = false }
thread-id = { version = "3.3.0", optional = true }
backtrace = { version = "0.3", optional = true }
accel = { git = "https://github.com/jon-chuang/accel", package = "accel", optional = true }
peekmore = "0.5.6"
closure = { version = "0.3.0", optional = true }
lazy_static = { version = "1.4.0", optional = true }
serde_json = { version = "1.0.58", optional = true }
dirs = { version = "1.0.5", optional = true }
log = { version = "0.4.11", optional = true }
paste = "0.1"
zeroize = { version = "1", default-features = false, features = ["zeroize_derive"] }

[dev-dependencies]
rand_xorshift = "0.2"

[features]
cuda = [ "std", "parallel", "accel", "lazy_static", "serde_json", "dirs", "closure", "log" ]
default = []
std = [ "ark-std/std", "ark-ff/std", "ark-serialize/std" ]
parallel = [ "std", "rayon", "ark-std/parallel" ]
309 changes: 309 additions & 0 deletions ec/src/batch_arith.rs
@@ -0,0 +1,309 @@
use crate::AffineCurve;
use ark_ff::{biginteger::BigInteger, fields::Field};
use ark_std::{ops::Neg, vec::Vec};
use either::Either;
use num_traits::Zero;

/// We use a batch size that is big enough to amortise the cost of the actual
/// inversion down to nearly zero, while not straining the CPU cache by
/// generating and fetching from large w-NAF tables and slices of [G].
pub const BATCH_AFFINE_BATCH_SIZE: usize = 4096;

/// The endomorphism code is packed into the second operand of the
/// `batch_add_in_place_read_only` method used by `batch_scalar_mul_in_place`.
/// 0 == Identity; 1 == Neg; 2 == GLV; 3 == GLV + Neg
pub const ENDO_CODING_BITS: usize = 2;

#[inline(always)]
pub fn decode_endo_from_u32(index_code: u32) -> (usize, u8) {
(
index_code as usize >> ENDO_CODING_BITS,
index_code as u8 % 4,
)
}
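// Illustrative round-trip (a sketch, not part of this diff): the table index
// occupies the high 30 bits and the endomorphism code the low 2 bits.
//
//     let code = (5u32 << ENDO_CODING_BITS) | 3; // index 5, "GLV + Neg"
//     assert_eq!(decode_endo_from_u32(code), (5usize, 3u8));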

pub trait BatchGroupArithmetic
where
Self: Sized + Clone + Copy + Zero + Neg<Output = Self>,
{
type BaseFieldForBatch: Field;

// We use the w-NAF method, achieving point density of approximately 1/(w + 1)
// and requiring storage of only 2^(w - 1).
// Refer to e.g. Improved Techniques for Fast Exponentiation, Section 4
// Bodo Möller 2002. https://www.bmoeller.de/pdf/fastexp-icisc2002.pdf

/// Computes [[p_1, 3 * p_1, ..., (2^w - 1) * p_1], ..., [p_n, 3 * p_n, ...,
/// (2^w - 1) * p_n]]. The flattened table stores one odd multiple at a time
/// (all 1 * p_i, then all 3 * p_i, ...), so the offsets must be adjusted when
/// indexing into it.
fn batch_wnaf_tables(bases: &[Self], w: usize) -> Vec<Self> {
let half_size = 1 << (w - 1);
let batch_size = bases.len();

let mut two_a = bases.to_vec();
let instr = (0..batch_size).map(|x| x as u32).collect::<Vec<_>>();
Self::batch_double_in_place(&mut two_a, &instr[..], None);

let mut tables = Vec::<Self>::with_capacity(half_size * batch_size);
tables.extend_from_slice(bases);
let mut scratch_space = Vec::<Option<Self>>::with_capacity((batch_size - 1) / 2 + 1);

for i in 1..half_size {
let instr = (0..batch_size)
.map(|x| (((i - 1) * batch_size + x) as u32, x as u32))
.collect::<Vec<_>>();
Self::batch_add_write_read_self(
&two_a[..],
&instr[..],
&mut tables,
&mut scratch_space,
);
}
tables
}
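// Layout sketch: for w = 3 (half_size = 4) and a batch of n points, `tables`
// holds [p_0, ..., p_{n-1}, 3p_0, ..., 3p_{n-1}, 5p_0, ..., 5p_{n-1},
// 7p_0, ..., 7p_{n-1}], so the odd multiple (2j + 1) * p_i lives at
// tables[j * batch_size + i].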

/// Computes the vectorised version of the w-NAF integer recoding.
/// Optionally takes a slice of booleans indicating whether the corresponding
/// scalar is negative; if so, that scalar's recoding is negated.
/// Mutates the scalars in place.
fn batch_wnaf_opcode_recoding<BigInt: BigInteger>(
scalars: &mut [BigInt],
w: usize,
negate: Option<&[bool]>,
) -> Vec<Vec<Option<i16>>> {
debug_assert!(w > 0);
let batch_size = scalars.len();
let window_size: i16 = 1 << (w + 1);
let half_window_size: i16 = 1 << w;

let mut op_code_vectorised = Vec::<Vec<Option<i16>>>::with_capacity(BigInt::NUM_LIMBS * 64);

let mut all_none = false;

if negate.is_some() {
debug_assert_eq!(scalars.len(), negate.unwrap().len()); // precompute
}

let f = false;
while !all_none {
let iter = match negate {
None => Either::Left(core::iter::repeat(&f).take(batch_size)),
Some(bools) => Either::Right(bools.iter()),
};
let mut opcode_row = Vec::with_capacity(batch_size);
for (s, &neg) in scalars.iter_mut().zip(iter) {
if s.is_zero() {
opcode_row.push(None);
} else {
let op = if s.is_odd() {
let mut z: i16 = (s.as_ref()[0] % (1 << (w + 1))) as i16;
if z < half_window_size {
s.sub_noborrow(&BigInt::from(z as u64));
} else {
z = z - window_size;
s.add_nocarry(&BigInt::from((-z) as u64));
}
if neg {
-z
} else {
z
}
} else {
0
};
opcode_row.push(Some(op));
s.div2();
}
}
all_none = opcode_row.iter().all(|x| x.is_none());
if !all_none {
op_code_vectorised.push(opcode_row);
}
}
op_code_vectorised
}
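// Worked example: with w = 2, the scalar 23 recodes (least-significant digit
// first) into the signed digits [-1, 0, 0, 3], since 23 = -1 + 3 * 2^3.
// Row d of the returned vector holds digit d of every scalar in the batch;
// None marks scalars that have already been reduced to zero.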

// We define a series of batched primitive EC ops, each of which is most
// suitable to a given scenario.
//
// We encode the indexes as u32s to save on fetch latency via better caching.
// The guiding principle is that the length of a batch of ops should never
// exceed about 2^20 and the table size should never exceed 2^10, so 32 bits
// will always be enough.

/// Doubles the bases at the given indices in place.
/// Accepts an optional scratch space, which might help by reducing the
/// number of heap allocations for the Vec-based scratch space.
fn batch_double_in_place(
bases: &mut [Self],
index: &[u32],
scratch_space: Option<&mut Vec<Self::BaseFieldForBatch>>,
);

/// Mutates bases in place and stores result in the first operand.
/// The element corresponding to the second operand becomes junk data.
fn batch_add_in_place_same_slice(bases: &mut [Self], index: &[(u32, u32)]);

/// Mutates bases in place and stores result in bases.
/// The elements in other become junk data.
fn batch_add_in_place(bases: &mut [Self], other: &mut [Self], index: &[(u32, u32)]);

/// Adds elements in bases with elements in other (for instance, a table),
/// utilising a scratch space to store intermediate results.
fn batch_add_in_place_read_only(
bases: &mut [Self],
other: &[Self],
index: &[(u32, u32)],
scratch_space: &mut Vec<Self>,
);

/// Looks up group elements according to `index` and either adds-and-writes
/// or simply writes them to `new_elems`, using the scratch space to store
/// intermediate values. The scratch space is always cleared after use.
///
/// A no-op, i.e. copying the element of the slice `lookup` at the position
/// given by the first operand straight into the `new_elems` vector, is
/// encoded by setting the index of the second operand to !0u32.
fn batch_add_write(
lookup: &[Self],
index: &[(u32, u32)],
new_elems: &mut Vec<Self>,
scratch_space: &mut Vec<Option<Self>>,
);

/// Similar to `batch_add_write`, except that the lookup for the first operand
/// is performed in `new_elems` rather than in `lookup`.
///
/// A no-op, i.e. copying the element of the slice `lookup` at the position
/// given by the first operand straight into the `new_elems` vector, is
/// encoded by setting the index of the second operand to !0u32.
fn batch_add_write_read_self(
lookup: &[Self],
index: &[(u32, u32)],
new_elems: &mut Vec<Self>,
scratch_space: &mut Vec<Option<Self>>,
);

/// Performs a batch scalar multiplication using the w-NAF encoding
/// utilising the primitive batched ops
fn batch_scalar_mul_in_place<BigInt: BigInteger>(
mut bases: &mut [Self],
scalars: &mut [BigInt],
w: usize,
) {
let batch_size = bases.len();
let opcode_vectorised = Self::batch_wnaf_opcode_recoding::<BigInt>(scalars, w, None);
let tables = Self::batch_wnaf_tables(bases, w);

// Set all points to 0;
let zero = Self::zero();
for p in bases.iter_mut() {
*p = zero;
}

for opcode_row in opcode_vectorised.iter().rev() {
let index_double: Vec<_> = opcode_row
.iter()
.enumerate()
.filter(|x| x.1.is_some())
.map(|x| x.0 as u32)
.collect();

Self::batch_double_in_place(&mut bases, &index_double[..], None);

let mut add_ops: Vec<Self> = opcode_row
.iter()
.enumerate()
.filter(|(_, op)| op.is_some() && op.unwrap() != 0)
.map(|(i, op)| {
let idx = op.unwrap();
if idx > 0 {
tables[(idx as usize) / 2 * batch_size + i].clone()
} else {
tables[(-idx as usize) / 2 * batch_size + i].clone().neg()
}
})
.collect();

let index_add: Vec<_> = opcode_row
.iter()
.enumerate()
.filter(|(_, op)| op.is_some() && op.unwrap() != 0)
.map(|x| x.0)
.enumerate()
.map(|(x, y)| (y as u32, x as u32))
.collect();

Self::batch_add_in_place(&mut bases, &mut add_ops[..], &index_add[..]);
}
}
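// In effect this is the standard double-and-add loop, vectorised across the
// batch: for each digit position (most significant first) every point with a
// remaining digit is doubled, and every point with a non-zero digit has the
// matching signed table entry added in.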

/// Chunks vectorised instructions into a size that does not require
/// storing a lot of intermediate state
fn get_chunked_instr<T: Clone>(instr: &[T], batch_size: usize) -> Vec<Vec<T>> {
let mut res = Vec::new();

let rem = instr.chunks_exact(batch_size).remainder();
let mut chunks = instr.chunks_exact(batch_size).peekable();

if chunks.len() == 0 {
res.push(rem.to_vec());
}

while let Some(chunk) = chunks.next() {
let chunk = if chunks.peek().is_none() {
[chunk, rem].concat()
} else {
chunk.to_vec()
};
res.push(chunk);
}
res
}
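// Behaviour sketch: with batch_size = 4, a slice of 10 instructions is split
// into chunks of [4, 6] (the last full chunk absorbs the remainder), while a
// slice of 3 instructions yields a single chunk of 3.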
}

/// We make the syntax for performing batch ops on slices cleaner
/// by defining a corresponding trait and impl for [G] rather than on G
pub trait BatchGroupArithmeticSlice<G: AffineCurve> {
fn batch_double_in_place(&mut self, index: &[u32]);

fn batch_add_in_place_same_slice(&mut self, index: &[(u32, u32)]);

fn batch_add_in_place(&mut self, other: &mut Self, index: &[(u32, u32)]);

fn batch_add_write(
&self,
index: &[(u32, u32)],
new_elems: &mut Vec<G>,
scratch_space: &mut Vec<Option<G>>,
);

fn batch_scalar_mul_in_place<BigInt: BigInteger>(&mut self, scalars: &mut [BigInt], w: usize);
}

impl<G: AffineCurve> BatchGroupArithmeticSlice<G> for [G] {
fn batch_double_in_place(&mut self, index: &[u32]) {
G::batch_double_in_place(self, index, None);
}

fn batch_add_in_place_same_slice(&mut self, index: &[(u32, u32)]) {
G::batch_add_in_place_same_slice(self, index);
}

fn batch_add_in_place(&mut self, other: &mut Self, index: &[(u32, u32)]) {
G::batch_add_in_place(self, other, index);
}

fn batch_add_write(
&self,
index: &[(u32, u32)],
new_elems: &mut Vec<G>,
scratch_space: &mut Vec<Option<G>>,
) {
G::batch_add_write(self, index, new_elems, scratch_space);
}

fn batch_scalar_mul_in_place<BigInt: BigInteger>(&mut self, scalars: &mut [BigInt], w: usize) {
G::batch_scalar_mul_in_place(self, scalars, w);
}
}
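
For orientation, a minimal usage sketch of the slice trait. The helper function, the window size, the generic bounds, and the module path ark_ec::batch_arith are illustrative assumptions rather than part of this diff: it doubles every point in a slice, then runs the batched w-NAF scalar multiplication.

use ark_ec::{batch_arith::BatchGroupArithmeticSlice, AffineCurve};
use ark_ff::PrimeField;
use ark_std::vec::Vec;

/// Doubles every point in `points`, then multiplies each point by its scalar
/// using the batched w-NAF routine with a window of size 4.
fn batch_double_then_mul<G: AffineCurve>(
    points: &mut [G],
    scalars: &mut [<G::ScalarField as PrimeField>::BigInt],
) {
    // One index per point: double the whole slice in a single batched pass.
    let index: Vec<u32> = (0..points.len() as u32).collect();
    points.batch_double_in_place(&index);
    // Batched scalar multiplication; the scalars are consumed in place.
    points.batch_scalar_mul_in_place(scalars, 4);
}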