Skip to content

Commit 5fa83f9

Browse files
committed
Optimize BLAKE2b and BLAKE2s algorithms
1 parent aa7ea72 commit 5fa83f9

File tree

3 files changed

+53
-73
lines changed

3 files changed

+53
-73
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,5 @@ wasm
3030

3131
docs
3232
benchmark*.js
33+
34+
test.html

src/blake2b.c

+20-28
Original file line numberDiff line numberDiff line change
@@ -119,26 +119,27 @@ static __inline__ void blake2b_increment_counter(const uint64_t inc) {
119119
b = rotr64(b ^ c, 63); \
120120
} while (0)
121121

122-
#define ROUND(r) \
123-
do { \
124-
G(r, 0, v[0], v[4], v[8], v[12]); \
125-
G(r, 1, v[1], v[5], v[9], v[13]); \
126-
G(r, 2, v[2], v[6], v[10], v[14]); \
127-
G(r, 3, v[3], v[7], v[11], v[15]); \
128-
G(r, 4, v[0], v[5], v[10], v[15]); \
129-
G(r, 5, v[1], v[6], v[11], v[12]); \
130-
G(r, 6, v[2], v[7], v[8], v[13]); \
131-
G(r, 7, v[3], v[4], v[9], v[14]); \
132-
} while (0)
122+
/*
 * One BLAKE2b round: applies the G mixing macro eight times to the 16-word
 * working vector v, viewed as a 4x4 matrix stored row by row.
 *
 *   r - round number, forwarded to G as its first argument
 *   m - message words of the block being compressed. It is not named in this
 *       body; presumably G (defined above this function) reads it through the
 *       identifier `m` - TODO confirm. It is never written here, hence const.
 *   v - working vector; G assigns to the elements passed to it, so v is
 *       modified in place
 *
 * Marked __inline__ like the file's other small static helper so the call
 * inside the fully unrolled round loop can be folded into the caller.
 *
 * NOTE(review): `round` is also the name of a <math.h> function. That is
 * harmless only while <math.h> is not included in this translation unit;
 * consider a prefixed name if that ever changes.
 */
static __inline__ void round(uint32_t r, const uint64_t m[16], uint64_t v[16]) {
  /* Column step: each call mixes one column (v[i], v[i+4], v[i+8], v[i+12]). */
  G(r, 0, v[0], v[4], v[8], v[12]);
  G(r, 1, v[1], v[5], v[9], v[13]);
  G(r, 2, v[2], v[6], v[10], v[14]);
  G(r, 3, v[3], v[7], v[11], v[15]);

  /* Diagonal step: each call mixes one wrapped diagonal of the matrix. */
  G(r, 4, v[0], v[5], v[10], v[15]);
  G(r, 5, v[1], v[6], v[11], v[12]);
  G(r, 6, v[2], v[7], v[8], v[13]);
  G(r, 7, v[3], v[4], v[9], v[14]);
}
133132

134133
static void blake2b_compress(const uint8_t block[BLAKE2B_BLOCKBYTES]) {
135134
uint64_t m[16];
136135
uint64_t v[16];
137136

137+
#pragma clang loop unroll(full)
138138
for (int i = 0; i < 16; ++i) {
139139
m[i] = load64(block + i * sizeof(m[i]));
140140
}
141141

142+
#pragma clang loop unroll(full)
142143
for (int i = 0; i < 8; ++i) {
143144
v[i] = S->h[i];
144145
}
@@ -152,26 +153,18 @@ static void blake2b_compress(const uint8_t block[BLAKE2B_BLOCKBYTES]) {
152153
v[14] = blake2b_IV[6] ^ S->f[0];
153154
v[15] = blake2b_IV[7] ^ S->f[1];
154155

155-
ROUND(0);
156-
ROUND(1);
157-
ROUND(2);
158-
ROUND(3);
159-
ROUND(4);
160-
ROUND(5);
161-
ROUND(6);
162-
ROUND(7);
163-
ROUND(8);
164-
ROUND(9);
165-
ROUND(10);
166-
ROUND(11);
156+
#pragma clang loop unroll(full)
157+
for (int i = 0; i < 12; ++i) {
158+
round(i, m, v);
159+
}
167160

161+
#pragma clang loop unroll(full)
168162
for (int i = 0; i < 8; ++i) {
169163
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
170164
}
171165
}
172166

173167
#undef G
174-
#undef ROUND
175168

176169
void blake2b_update(const void *pin, int inlen) {
177170
const unsigned char *in = (const unsigned char *)pin;
@@ -229,9 +222,7 @@ void Hash_Final() {
229222
}
230223

231224
static void blake2b_init0() {
232-
for (int i = 0; i < sizeof(blake2b_state); i++) {
233-
((uint8_t*)S)[i] = 0;
234-
}
225+
memset(S, 0, sizeof(blake2b_state));
235226

236227
for (int i = 0; i < 8; ++i) {
237228
S->h[i] = blake2b_IV[i];
@@ -270,7 +261,8 @@ void blake2b_init_key(int outlen, const uint8_t *key, int keylen) {
270261
blake2b_init_param();
271262

272263
if (keylen > 0) {
273-
uint8_t block[BLAKE2B_BLOCKBYTES] = { 0 };
264+
uint8_t block[BLAKE2B_BLOCKBYTES];
265+
memset128(block, 0);
274266
for (uint8_t i = 0; i < keylen; i++) {
275267
block[i] = key[i];
276268
}

src/blake2s.c

+31-45
Original file line numberDiff line numberDiff line change
@@ -114,57 +114,47 @@ static __inline__ void blake2s_increment_counter(const uint32_t inc) {
114114
b = rotr32(b ^ c, 7); \
115115
} while (0)
116116

117-
#define ROUND(r) \
118-
do { \
119-
G(r, 0, v[0], v[4], v[8], v[12]); \
120-
G(r, 1, v[1], v[5], v[9], v[13]); \
121-
G(r, 2, v[2], v[6], v[10], v[14]); \
122-
G(r, 3, v[3], v[7], v[11], v[15]); \
123-
G(r, 4, v[0], v[5], v[10], v[15]); \
124-
G(r, 5, v[1], v[6], v[11], v[12]); \
125-
G(r, 6, v[2], v[7], v[8], v[13]); \
126-
G(r, 7, v[3], v[4], v[9], v[14]); \
127-
} while (0)
117+
/*
 * One BLAKE2s round: applies the G mixing macro eight times to the 16-word
 * working vector v, viewed as a 4x4 matrix stored row by row.
 *
 *   r - round number, forwarded to G as its first argument
 *   m - message words of the block being compressed. It is not named in this
 *       body; presumably G (defined above this function) reads it through the
 *       identifier `m` - TODO confirm. It is never written here, hence const.
 *   v - working vector; G assigns to the elements passed to it, so v is
 *       modified in place
 *
 * Marked __inline__ like the file's other small static helper so the call
 * inside the fully unrolled round loop can be folded into the caller.
 *
 * NOTE(review): `round` is also the name of a <math.h> function. That is
 * harmless only while <math.h> is not included in this translation unit;
 * consider a prefixed name if that ever changes.
 */
static __inline__ void round(uint32_t r, const uint32_t m[16], uint32_t v[16]) {
  /* Column step: each call mixes one column (v[i], v[i+4], v[i+8], v[i+12]). */
  G(r, 0, v[0], v[4], v[8], v[12]);
  G(r, 1, v[1], v[5], v[9], v[13]);
  G(r, 2, v[2], v[6], v[10], v[14]);
  G(r, 3, v[3], v[7], v[11], v[15]);

  /* Diagonal step: each call mixes one wrapped diagonal of the matrix. */
  G(r, 4, v[0], v[5], v[10], v[15]);
  G(r, 5, v[1], v[6], v[11], v[12]);
  G(r, 6, v[2], v[7], v[8], v[13]);
  G(r, 7, v[3], v[4], v[9], v[14]);
}
128127

129128
static void blake2s_compress(const uint8_t block[BLAKE2S_BLOCKBYTES]) {
130129
uint32_t m[16];
131130
uint32_t v[16];
132131

133-
for (int i = 0; i < 16; ++i) {
134-
m[i] = load32(block + i * sizeof(m[i]));
135-
}
132+
memcpy64(m, block);
136133

137-
for (int i = 0; i < 8; ++i) {
138-
v[i] = S->h[i];
139-
}
134+
memcpy32(v, S->h);
140135

141-
v[8] = blake2s_IV[0];
142-
v[9] = blake2s_IV[1];
143-
v[10] = blake2s_IV[2];
144-
v[11] = blake2s_IV[3];
145-
v[12] = blake2s_IV[4] ^ S->t[0];
146-
v[13] = blake2s_IV[5] ^ S->t[1];
147-
v[14] = blake2s_IV[6] ^ S->f[0];
148-
v[15] = blake2s_IV[7] ^ S->f[1];
149-
150-
ROUND(0);
151-
ROUND(1);
152-
ROUND(2);
153-
ROUND(3);
154-
ROUND(4);
155-
ROUND(5);
156-
ROUND(6);
157-
ROUND(7);
158-
ROUND(8);
159-
ROUND(9);
136+
uint64_t* v64 = (uint64_t*)v;
137+
uint64_t* blake2s_IV64 = (uint64_t*)blake2s_IV;
138+
uint64_t* st64 = (uint64_t*)S->t;
139+
uint64_t* sf64 = (uint64_t*)S->f;
140+
v64[4] = blake2s_IV64[0];
141+
v64[5] = blake2s_IV64[1];
142+
v64[6] = blake2s_IV64[2] ^ st64[0];
143+
v64[7] = blake2s_IV64[3] ^ sf64[0];
160144

161-
for (int i = 0; i < 8; ++i) {
162-
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
145+
#pragma clang loop unroll(full)
146+
for (int i = 0; i < 10; ++i) {
147+
round(i, m, v);
148+
}
149+
150+
uint64_t* sh64 = (uint64_t*)S->h;
151+
#pragma clang loop unroll(full)
152+
for (int i = 0; i < 4; ++i) {
153+
sh64[i] = sh64[i] ^ v64[i] ^ v64[i + 4];
163154
}
164155
}
165156

166157
#undef G
167-
#undef ROUND
168158

169159
void blake2s_update(const void *pin, int inlen) {
170160
const unsigned char *in = (const unsigned char *)pin;
@@ -174,9 +164,7 @@ void blake2s_update(const void *pin, int inlen) {
174164
if (inlen > fill) {
175165
S->buflen = 0;
176166
/* Fill buffer */
177-
for (uint8_t i = 0; i < fill; i++) {
178-
S->buf[left + i] = in[i];
179-
}
167+
memcpy(&S->buf[left], in, fill);
180168
blake2s_increment_counter(BLAKE2S_BLOCKBYTES);
181169
blake2s_compress(S->buf); /* Compress */
182170
in += fill;
@@ -188,9 +176,7 @@ void blake2s_update(const void *pin, int inlen) {
188176
inlen -= BLAKE2S_BLOCKBYTES;
189177
}
190178
}
191-
for (uint8_t i = 0; i < inlen; i++) {
192-
S->buf[S->buflen + i] = in[i];
193-
}
179+
memcpy(&S->buf[S->buflen], in, inlen);
194180
S->buflen += inlen;
195181
}
196182
}

0 commit comments

Comments
 (0)