14
14
// limitations under the License.
15
15
//
16
16
17
- //+build ignore
18
-
19
17
//+build !noasm,!appengine
20
18
21
19
// Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
22
20
// the opcodes of their Plan9 equivalents
23
21
22
+ #include "textflag.h"
23
+
24
24
#define REDUCE_MOD(x0, x1, x2, x3, tmp0, tmp1, y0, y1) \
25
25
MOVD $0x3FFFFFFFFFFFFFFF , tmp0 \
26
26
AND tmp0, x3 \
50
50
EOR x1, y1 \
51
51
EOR x3, y1
52
52
53
- #define UPDATE(MSG1, MSG2) \
53
+ #define UPDATE(MSG1, MSG2) \
54
54
\ // Add message
55
- VADD MSG1.D2, V2.D2, V2.D2 \
56
- VADD MSG2.D2, V3.D2, V3.D2 \
57
- \
55
+ VADD MSG1.D2, V2.D2, V2.D2 \
56
+ VADD MSG2.D2, V3.D2, V3.D2 \
57
+ \
58
58
\ // v1 += mul0
59
- VADD V4.D2, V2.D2, V2.D2 \
60
- VADD V5.D2, V3.D2, V3.D2 \
61
- \
59
+ VADD V4.D2, V2.D2, V2.D2 \
60
+ VADD V5.D2, V3.D2, V3.D2 \
61
+ \
62
62
\ // First pair of multiplies
63
- VTBL V29.B16, [V0.B16, V1.B16], V10.B16 \
64
- VTBL V30.B16, [V2.B16, V3.B16], V11.B16 \
65
- \
66
- \ // VUMULL V10.S2, V11.S2, V12.D2 /* assembler support missing */
67
- \ // VUMULL2 V10.S4, V11.S4, V13.D2 /* assembler support missing */
68
- WORD $0x2eaac16c \ // umull v12.2d, v11.2s, v10.2s
69
- WORD $0x6eaac16d \ // umull2 v13.2d, v11.4s, v10.4s
70
- \
63
+ VTBL V29.B16, [V0.B16, V1.B16], V10.B16 \
64
+ VTBL V30.B16, [V2.B16, V3.B16], V11.B16 \
65
+ \
66
+ \ // VUMULL V10.S2, V11.S2, V12.D2 /* assembler support missing */
67
+ \ // VUMULL2 V10.S4, V11.S4, V13.D2 /* assembler support missing */
68
+ WORD $0x2eaac16c \ // umull v12.2d, v11.2s, v10.2s
69
+ WORD $0x6eaac16d \ // umull2 v13.2d, v11.4s, v10.4s
70
+ \
71
71
\ // v0 += mul1
72
- VADD V6.D2, V0.D2, V0.D2 \
73
- VADD V7.D2, V1.D2, V1.D2 \
74
- \
72
+ VADD V6.D2, V0.D2, V0.D2 \
73
+ VADD V7.D2, V1.D2, V1.D2 \
74
+ \
75
75
\ // Second pair of multiplies
76
- VTBL V29.B16, [V2.B16, V3.B16], V15.B16 \
77
- VTBL V30.B16, [V0.B16, V1.B16], V14.B16 \
78
- \
76
+ VTBL V29.B16, [V2.B16, V3.B16], V15.B16 \
77
+ VTBL V30.B16, [V0.B16, V1.B16], V14.B16 \
78
+ \
79
79
\ // EOR multiplication result in
80
- VEOR V12.B16, V4.B16, V4.B16 \
81
- VEOR V13.B16, V5.B16, V5.B16 \
82
- \
80
+ VEOR V12.B16, V4.B16, V4.B16 \
81
+ VEOR V13.B16, V5.B16, V5.B16 \
82
+ \
83
83
\ // VUMULL V14.S2, V15.S2, V16.D2 /* assembler support missing */
84
84
\ // VUMULL2 V14.S4, V15.S4, V17.D2 /* assembler support missing */
85
- WORD $0x2eaec1f0 \ // umull v16.2d, v15.2s, v14.2s
86
- WORD $0x6eaec1f1 \ // umull2 v17.2d, v15.4s, v14.4s
87
- \
85
+ WORD $0x2eaec1f0 \ // umull v16.2d, v15.2s, v14.2s
86
+ WORD $0x6eaec1f1 \ // umull2 v17.2d, v15.4s, v14.4s
87
+ \
88
88
\ // First pair of zipper-merges
89
- VTBL V28.B16, [V2.B16], V18.B16 \
90
- VADD V18.D2, V0.D2, V0.D2 \
91
- VTBL V28.B16, [V3.B16], V19.B16 \
92
- VADD V19.D2, V1.D2, V1.D2 \
93
- \
89
+ VTBL V28.B16, [V2.B16], V18.B16 \
90
+ VADD V18.D2, V0.D2, V0.D2 \
91
+ VTBL V28.B16, [V3.B16], V19.B16 \
92
+ VADD V19.D2, V1.D2, V1.D2 \
93
+ \
94
94
\ // Second pair of zipper-merges
95
- VTBL V28.B16, [V0.B16], V20.B16 \
96
- VADD V20.D2, V2.D2, V2.D2 \
97
- VTBL V28.B16, [V1.B16], V21.B16 \
98
- VADD V21.D2, V3.D2, V3.D2 \
99
- \
95
+ VTBL V28.B16, [V0.B16], V20.B16 \
96
+ VADD V20.D2, V2.D2, V2.D2 \
97
+ VTBL V28.B16, [V1.B16], V21.B16 \
98
+ VADD V21.D2, V3.D2, V3.D2 \
99
+ \
100
100
\ // EOR multiplication result in
101
- VEOR V16.B16, V6.B16, V6.B16 \
101
+ VEOR V16.B16, V6.B16, V6.B16 \
102
102
VEOR V17.B16, V7.B16, V7.B16
103
103
104
-
105
104
// func initializeArm64(state *[16]uint64, key []byte)
106
- TEXT ·initializeArm64(SB), 7 , $0
105
+ TEXT ·initializeArm64(SB), NOSPLIT , $0
107
106
MOVD state+0 (FP), R0
108
107
MOVD key_base+8 (FP), R1
109
108
@@ -112,7 +111,7 @@ TEXT ·initializeArm64(SB), 7, $0
112
111
VREV64 V1.S4, V3.S4
113
112
VREV64 V2.S4, V4.S4
114
113
115
- MOVD $·constants (SB), R3
114
+ MOVD $·asmConstants (SB), R3
116
115
VLD1 (R3), [V5.S4, V6.S4, V7.S4, V8.S4]
117
116
VEOR V5.B16, V1.B16, V1.B16
118
117
VEOR V6.B16, V2.B16, V2.B16
@@ -123,8 +122,7 @@ TEXT ·initializeArm64(SB), 7, $0
123
122
VST1 [V5.D2, V6.D2, V7.D2, V8.D2], (R0)
124
123
RET
125
124
126
-
127
- TEXT ·updateArm64(SB), 7 , $0
125
+ TEXT ·updateArm64(SB), NOSPLIT, $0
128
126
MOVD state+0 (FP), R0
129
127
MOVD msg_base+8 (FP), R1
130
128
MOVD msg_len+16 (FP), R2 // length of message
@@ -142,7 +140,7 @@ TEXT ·updateArm64(SB), 7, $0
142
140
// v7 = mul1.hi
143
141
144
142
// Load zipper merge constants table pointer
145
- MOVD $·zipperMerge (SB), R3
143
+ MOVD $·asmZipperMerge (SB), R3
146
144
147
145
// and load zipper merge constants into v28, v29, and v30
148
146
VLD1 (R3), [V28.B16, V29.B16, V30.B16]
@@ -167,15 +165,14 @@ loop:
167
165
complete:
168
166
RET
169
167
170
-
171
168
// func finalizeArm64(out []byte, state *[16]uint64)
172
- TEXT ·finalizeArm64(SB), 4 , $0 -32
169
+ TEXT ·finalizeArm64(SB), NOSPLIT , $0 -32
173
170
MOVD state+24 (FP), R0
174
171
MOVD out_base+0 (FP), R1
175
172
MOVD out_len+8 (FP), R2
176
173
177
174
// Load zipper merge constants table pointer
178
- MOVD $·zipperMerge (SB), R3
175
+ MOVD $·asmZipperMerge (SB), R3
179
176
180
177
// and load zipper merge constants into v28, v29, and v30
181
178
VLD1 (R3), [V28.B16, V29.B16, V30.B16]
@@ -200,8 +197,8 @@ TEXT ·finalizeArm64(SB), 4, $0-32
200
197
VREV64 V0.S4, V27.S4
201
198
UPDATE(V26, V27)
202
199
203
- CMP $8 , R2
204
- BEQ skipUpdate // Just 4 rounds for 64-bit checksum
200
+ CMP $8 , R2
201
+ BEQ skipUpdate // Just 4 rounds for 64-bit checksum
205
202
206
203
VREV64 V1.S4, V26.S4
207
204
VREV64 V0.S4, V27.S4
@@ -211,8 +208,8 @@ TEXT ·finalizeArm64(SB), 4, $0-32
211
208
VREV64 V0.S4, V27.S4
212
209
UPDATE(V26, V27)
213
210
214
- CMP $16 , R2
215
- BEQ skipUpdate // 6 rounds for 128-bit checksum
211
+ CMP $16 , R2
212
+ BEQ skipUpdate // 6 rounds for 128-bit checksum
216
213
217
214
VREV64 V1.S4, V26.S4
218
215
VREV64 V0.S4, V27.S4
@@ -282,16 +279,16 @@ hash128:
282
279
MOVD 1*8 (R0), R9
283
280
MOVD 6*8 (R0), R10
284
281
MOVD 7*8 (R0), R11
285
- ADD R10, R8
286
- ADD R11, R9
282
+ ADD R10, R8
283
+ ADD R11, R9
287
284
MOVD 8*8 (R0), R10
288
285
MOVD 9*8 (R0), R11
289
- ADD R10, R8
290
- ADD R11, R9
286
+ ADD R10, R8
287
+ ADD R11, R9
291
288
MOVD 14*8 (R0), R10
292
289
MOVD 15*8 (R0), R11
293
- ADD R10, R8
294
- ADD R11, R9
290
+ ADD R10, R8
291
+ ADD R11, R9
295
292
MOVD R8, 0 (R1)
296
293
MOVD R9, 8 (R1)
297
294
RET
@@ -307,22 +304,21 @@ hash64:
307
304
MOVD R4, (R1)
308
305
RET
309
306
310
-
311
- DATA ·constants+0x00 (SB)/8 , $0xdbe6d5d5fe4cce2f
312
- DATA ·constants+0x08 (SB)/8 , $0xa4093822299f31d0
313
- DATA ·constants+0x10 (SB)/8 , $0x13198a2e03707344
314
- DATA ·constants+0x18 (SB)/8 , $0x243f6a8885a308d3
315
- DATA ·constants+0x20 (SB)/8 , $0x3bd39e10cb0ef593
316
- DATA ·constants+0x28 (SB)/8 , $0xc0acf169b5f18a8c
317
- DATA ·constants+0x30 (SB)/8 , $0xbe5466cf34e90c6c
318
- DATA ·constants+0x38 (SB)/8 , $0x452821e638d01377
319
- GLOBL ·constants(SB), 8 , $64
307
+ DATA ·asmConstants+0x00 (SB)/8 , $0xdbe6d5d5fe4cce2f
308
+ DATA ·asmConstants+0x08 (SB)/8 , $0xa4093822299f31d0
309
+ DATA ·asmConstants+0x10 (SB)/8 , $0x13198a2e03707344
310
+ DATA ·asmConstants+0x18 (SB)/8 , $0x243f6a8885a308d3
311
+ DATA ·asmConstants+0x20 (SB)/8 , $0x3bd39e10cb0ef593
312
+ DATA ·asmConstants+0x28 (SB)/8 , $0xc0acf169b5f18a8c
313
+ DATA ·asmConstants+0x30 (SB)/8 , $0xbe5466cf34e90c6c
314
+ DATA ·asmConstants+0x38 (SB)/8 , $0x452821e638d01377
315
+ GLOBL ·asmConstants(SB), 8 , $64
320
316
321
317
// Constants for TBL instructions
322
- DATA ·zipperMerge +0x0 (SB)/8 , $0x000f010e05020c03 // zipper merge constant
323
- DATA ·zipperMerge +0x8 (SB)/8 , $0x070806090d0a040b
324
- DATA ·zipperMerge +0x10 (SB)/8 , $0x0f0e0d0c07060504 // setup first register for multiply
325
- DATA ·zipperMerge +0x18 (SB)/8 , $0x1f1e1d1c17161514
326
- DATA ·zipperMerge +0x20 (SB)/8 , $0x0b0a090803020100 // setup second register for multiply
327
- DATA ·zipperMerge +0x28 (SB)/8 , $0x1b1a191813121110
328
- GLOBL ·zipperMerge (SB), 8 , $48
318
+ DATA ·asmZipperMerge +0x0 (SB)/8 , $0x000f010e05020c03 // zipper merge constant
319
+ DATA ·asmZipperMerge +0x8 (SB)/8 , $0x070806090d0a040b
320
+ DATA ·asmZipperMerge +0x10 (SB)/8 , $0x0f0e0d0c07060504 // setup first register for multiply
321
+ DATA ·asmZipperMerge +0x18 (SB)/8 , $0x1f1e1d1c17161514
322
+ DATA ·asmZipperMerge +0x20 (SB)/8 , $0x0b0a090803020100 // setup second register for multiply
323
+ DATA ·asmZipperMerge +0x28 (SB)/8 , $0x1b1a191813121110
324
+ GLOBL ·asmZipperMerge (SB), 8 , $48
0 commit comments