Skip to content

Commit 8579bce

Browse files
committed
crypto/internal/fips140/aes: optimize amd64
Implement an overflow-aware optimization in ctrBlocks8Asm: take a fast branch when there is no overflow. One branch per 8 blocks is faster than 7 increments in general-purpose registers followed by transfers from them to XMM registers.

Added AES-192 and AES-256 modes to the AES-CTR benchmark.

Added a correctness test in ctr_aes_test.go for the overflow optimization.

This improves performance, especially in AES-128 mode.

goos: windows
goarch: amd64
pkg: crypto/cipher
cpu: AMD Ryzen 7 5800H with Radeon Graphics
                  │     B/s      │     B/s        vs base
AESCTR/128/50-16    1.377Gi ± 0%   1.384Gi ± 0%    +0.51% (p=0.028 n=20)
AESCTR/128/1K-16    6.164Gi ± 0%   6.892Gi ± 1%   +11.81% (p=0.000 n=20)
AESCTR/128/8K-16    7.372Gi ± 0%   8.768Gi ± 1%   +18.95% (p=0.000 n=20)
AESCTR/192/50-16    1.289Gi ± 0%   1.279Gi ± 0%    -0.75% (p=0.001 n=20)
AESCTR/192/1K-16    5.734Gi ± 0%   6.011Gi ± 0%    +4.83% (p=0.000 n=20)
AESCTR/192/8K-16    6.889Gi ± 1%   7.437Gi ± 0%    +7.96% (p=0.000 n=20)
AESCTR/256/50-16    1.170Gi ± 0%   1.163Gi ± 0%    -0.54% (p=0.005 n=20)
AESCTR/256/1K-16    5.235Gi ± 0%   5.391Gi ± 0%    +2.98% (p=0.000 n=20)
AESCTR/256/8K-16    6.361Gi ± 0%   6.676Gi ± 0%    +4.94% (p=0.000 n=20)
geomean             3.681Gi        3.882Gi         +5.46%

The slight slowdown on 50-byte workloads is unrelated to this change, because such workloads never use ctrBlocks8Asm.
1 parent 3dd9fc0 commit 8579bce

File tree

4 files changed

+186
-30
lines changed

4 files changed

+186
-30
lines changed

src/crypto/cipher/benchmark_test.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,12 @@ func BenchmarkAESGCM(b *testing.B) {
6565
}
6666
}
6767

68-
func benchmarkAESStream(b *testing.B, mode func(cipher.Block, []byte) cipher.Stream, buf []byte) {
68+
func benchmarkAESStream(b *testing.B, mode func(cipher.Block, []byte) cipher.Stream, buf []byte, keySize int) {
6969
b.SetBytes(int64(len(buf)))
7070

71-
var key [16]byte
71+
key := make([]byte, keySize)
7272
var iv [16]byte
73-
aes, _ := aes.NewCipher(key[:])
73+
aes, _ := aes.NewCipher(key)
7474
stream := mode(aes, iv[:])
7575

7676
b.ResetTimer()
@@ -87,15 +87,20 @@ const almost1K = 1024 - 5
8787
const almost8K = 8*1024 - 5
8888

8989
func BenchmarkAESCTR(b *testing.B) {
90-
b.Run("50", func(b *testing.B) {
91-
benchmarkAESStream(b, cipher.NewCTR, make([]byte, 50))
92-
})
93-
b.Run("1K", func(b *testing.B) {
94-
benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost1K))
95-
})
96-
b.Run("8K", func(b *testing.B) {
97-
benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost8K))
98-
})
90+
for _, keyBits := range []int{128, 192, 256} {
91+
keySize := keyBits / 8
92+
b.Run(strconv.Itoa(keyBits), func(b *testing.B) {
93+
b.Run("50", func(b *testing.B) {
94+
benchmarkAESStream(b, cipher.NewCTR, make([]byte, 50), keySize)
95+
})
96+
b.Run("1K", func(b *testing.B) {
97+
benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost1K), keySize)
98+
})
99+
b.Run("8K", func(b *testing.B) {
100+
benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost8K), keySize)
101+
})
102+
})
103+
}
99104
}
100105

101106
func BenchmarkAESCBCEncrypt1K(b *testing.B) {

src/crypto/cipher/ctr_aes_test.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"crypto/internal/boring"
1818
"crypto/internal/cryptotest"
1919
fipsaes "crypto/internal/fips140/aes"
20+
"encoding/binary"
2021
"encoding/hex"
2122
"fmt"
2223
"math/rand"
@@ -117,6 +118,60 @@ func makeTestingCiphers(aesBlock cipher.Block, iv []byte) (genericCtr, multibloc
117118
return cipher.NewCTR(wrap(aesBlock), iv), cipher.NewCTR(aesBlock, iv)
118119
}
119120

121+
// TestCTR_AES_blocks8FastPathMatchesGeneric ensures the overflow-aware branch
122+
// produces identical keystreams to the generic counter walker across
123+
// representative IVs, including near-overflow cases.
124+
func TestCTR_AES_blocks8FastPathMatchesGeneric(t *testing.T) {
125+
key := make([]byte, aes.BlockSize)
126+
block, err := aes.NewCipher(key)
127+
if err != nil {
128+
t.Fatal(err)
129+
}
130+
if _, ok := block.(*fipsaes.Block); !ok {
131+
t.Skip("requires crypto/internal/fips140/aes")
132+
}
133+
134+
keystream := make([]byte, 8*aes.BlockSize)
135+
136+
testCases := []struct {
137+
name string
138+
hi uint64
139+
lo uint64
140+
}{
141+
{"Zero", 0, 0},
142+
{"NearOverflowMinus7", 1, ^uint64(0) - 7},
143+
{"NearOverflowMinus6", 2, ^uint64(0) - 6},
144+
{"Overflow", 0, ^uint64(0)},
145+
}
146+
147+
for _, tc := range testCases {
148+
t.Run(tc.name, func(t *testing.T) {
149+
var iv [aes.BlockSize]byte
150+
binary.BigEndian.PutUint64(iv[0:8], tc.hi)
151+
binary.BigEndian.PutUint64(iv[8:], tc.lo)
152+
153+
generic, multiblock := makeTestingCiphers(block, iv[:])
154+
155+
genericOut := make([]byte, len(keystream))
156+
multiblockOut := make([]byte, len(keystream))
157+
158+
generic.XORKeyStream(genericOut, keystream)
159+
multiblock.XORKeyStream(multiblockOut, keystream)
160+
161+
if !bytes.Equal(multiblockOut, genericOut) {
162+
t.Fatalf("mismatch for iv %#x:%#x\n"+
163+
"asm keystream: %x\n"+
164+
"gen keystream: %x\n"+
165+
"asm counters: %x\n"+
166+
"gen counters: %x",
167+
tc.hi, tc.lo, multiblockOut, genericOut,
168+
extractCounters(block, multiblockOut),
169+
extractCounters(block, genericOut))
170+
}
171+
})
172+
}
173+
}
174+
120175
func randBytes(t *testing.T, r *rand.Rand, count int) []byte {
121176
t.Helper()
122177
buf := make([]byte, count)
@@ -297,3 +352,12 @@ func TestCTR_AES_multiblock_XORKeyStreamAt(t *testing.T) {
297352
})
298353
}
299354
}
355+
356+
func extractCounters(block cipher.Block, keystream []byte) []byte {
357+
blockSize := block.BlockSize()
358+
res := make([]byte, len(keystream))
359+
for i := 0; i < len(keystream); i += blockSize {
360+
block.Decrypt(res[i:i+blockSize], keystream[i:i+blockSize])
361+
}
362+
return res
363+
}

src/crypto/internal/fips140/aes/_asm/ctr/ctr_amd64_asm.go

Lines changed: 72 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,19 +40,79 @@ func ctrBlocks(numBlocks int) {
4040
bswap := XMM()
4141
MOVOU(bswapMask(), bswap)
4242

43-
blocks := make([]VecVirtual, 0, numBlocks)
43+
blocks := make([]VecVirtual, numBlocks)
44+
45+
// For the 8-block case we optimize counter generation. We build the first
46+
// counter as usual, then check whether the remaining seven increments will
47+
// overflow. When they do not (the common case) we keep the work entirely in
48+
// XMM registers to avoid expensive general-purpose -> XMM moves. Otherwise
49+
// we fall back to the traditional scalar path.
50+
if numBlocks == 8 {
51+
for i := range blocks {
52+
blocks[i] = XMM()
53+
}
4454

45-
// Lay out counter block plaintext.
46-
for i := 0; i < numBlocks; i++ {
47-
x := XMM()
48-
blocks = append(blocks, x)
49-
50-
MOVQ(ivlo, x)
51-
PINSRQ(Imm(1), ivhi, x)
52-
PSHUFB(bswap, x)
53-
if i < numBlocks-1 {
54-
ADDQ(Imm(1), ivlo)
55-
ADCQ(Imm(0), ivhi)
55+
base := XMM()
56+
tmp := GP64()
57+
addVec := XMM()
58+
59+
MOVQ(ivlo, blocks[0])
60+
PINSRQ(Imm(1), ivhi, blocks[0])
61+
MOVAPS(blocks[0], base)
62+
PSHUFB(bswap, blocks[0])
63+
64+
// Check whether any of these eight counters will overflow.
65+
MOVQ(ivlo, tmp)
66+
ADDQ(Imm(uint64(numBlocks-1)), tmp)
67+
slowLabel := fmt.Sprintf("ctr%d_slow", numBlocks)
68+
doneLabel := fmt.Sprintf("ctr%d_done", numBlocks)
69+
JC(LabelRef(slowLabel))
70+
71+
// Fast branch: create an XMM increment vector containing the value 1.
72+
// Adding it to the base counter yields each subsequent counter.
73+
XORQ(tmp, tmp)
74+
INCQ(tmp)
75+
PXOR(addVec, addVec)
76+
PINSRQ(Imm(0), tmp, addVec)
77+
78+
for i := 1; i < numBlocks; i++ {
79+
PADDQ(addVec, base)
80+
MOVAPS(base, blocks[i])
81+
}
82+
JMP(LabelRef(doneLabel))
83+
84+
Label(slowLabel)
85+
ADDQ(Imm(1), ivlo)
86+
ADCQ(Imm(0), ivhi)
87+
for i := 1; i < numBlocks; i++ {
88+
MOVQ(ivlo, blocks[i])
89+
PINSRQ(Imm(1), ivhi, blocks[i])
90+
if i < numBlocks-1 {
91+
ADDQ(Imm(1), ivlo)
92+
ADCQ(Imm(0), ivhi)
93+
}
94+
}
95+
96+
Label(doneLabel)
97+
98+
// Convert little-endian counters to big-endian after the branch since
99+
// both paths share the same shuffle sequence.
100+
for i := 1; i < numBlocks; i++ {
101+
PSHUFB(bswap, blocks[i])
102+
}
103+
} else {
104+
// Lay out counter block plaintext.
105+
for i := 0; i < numBlocks; i++ {
106+
x := XMM()
107+
blocks[i] = x
108+
109+
MOVQ(ivlo, x)
110+
PINSRQ(Imm(1), ivhi, x)
111+
PSHUFB(bswap, x)
112+
if i < numBlocks-1 {
113+
ADDQ(Imm(1), ivlo)
114+
ADCQ(Imm(0), ivhi)
115+
}
56116
}
57117
}
58118

src/crypto/internal/fips140/aes/ctr_amd64.s

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -286,41 +286,68 @@ TEXT ·ctrBlocks8Asm(SB), $0-48
286286
MOVOU bswapMask<>+0(SB), X0
287287
MOVQ SI, X1
288288
PINSRQ $0x01, DI, X1
289+
MOVAPS X1, X8
289290
PSHUFB X0, X1
291+
MOVQ SI, R8
292+
ADDQ $0x07, R8
293+
JC ctr8_slow
294+
XORQ R8, R8
295+
INCQ R8
296+
PXOR X9, X9
297+
PINSRQ $0x00, R8, X9
298+
PADDQ X9, X8
299+
MOVAPS X8, X2
300+
PADDQ X9, X8
301+
MOVAPS X8, X3
302+
PADDQ X9, X8
303+
MOVAPS X8, X4
304+
PADDQ X9, X8
305+
MOVAPS X8, X5
306+
PADDQ X9, X8
307+
MOVAPS X8, X6
308+
PADDQ X9, X8
309+
MOVAPS X8, X7
310+
PADDQ X9, X8
311+
MOVAPS X8, X8
312+
JMP ctr8_done
313+
314+
ctr8_slow:
290315
ADDQ $0x01, SI
291316
ADCQ $0x00, DI
292317
MOVQ SI, X2
293318
PINSRQ $0x01, DI, X2
294-
PSHUFB X0, X2
295319
ADDQ $0x01, SI
296320
ADCQ $0x00, DI
297321
MOVQ SI, X3
298322
PINSRQ $0x01, DI, X3
299-
PSHUFB X0, X3
300323
ADDQ $0x01, SI
301324
ADCQ $0x00, DI
302325
MOVQ SI, X4
303326
PINSRQ $0x01, DI, X4
304-
PSHUFB X0, X4
305327
ADDQ $0x01, SI
306328
ADCQ $0x00, DI
307329
MOVQ SI, X5
308330
PINSRQ $0x01, DI, X5
309-
PSHUFB X0, X5
310331
ADDQ $0x01, SI
311332
ADCQ $0x00, DI
312333
MOVQ SI, X6
313334
PINSRQ $0x01, DI, X6
314-
PSHUFB X0, X6
315335
ADDQ $0x01, SI
316336
ADCQ $0x00, DI
317337
MOVQ SI, X7
318338
PINSRQ $0x01, DI, X7
319-
PSHUFB X0, X7
320339
ADDQ $0x01, SI
321340
ADCQ $0x00, DI
322341
MOVQ SI, X8
323342
PINSRQ $0x01, DI, X8
343+
344+
ctr8_done:
345+
PSHUFB X0, X2
346+
PSHUFB X0, X3
347+
PSHUFB X0, X4
348+
PSHUFB X0, X5
349+
PSHUFB X0, X6
350+
PSHUFB X0, X7
324351
PSHUFB X0, X8
325352
MOVUPS (CX), X0
326353
PXOR X0, X1

0 commit comments

Comments
 (0)