Skip to content

Commit 08ce0b4

Browse files
authored
Fix ARM64 assembly (#19)
The wrong constants were used for ARM64, leading to wrong values being calculated. This is likely due to a linker change or similar.

* Rename to less generic names.
* Use textflag.h
* Apply asmfmt

Fixes #17

* Re-enable asm...
1 parent 5311fe9 commit 08ce0b4

5 files changed

+99
-105
lines changed

highwayhash_amd64.s

+19-19
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,19 @@
66

77
#include "textflag.h"
88

9-
DATA ·cons<>+0x00(SB)/8, $0xdbe6d5d5fe4cce2f
10-
DATA ·cons<>+0x08(SB)/8, $0xa4093822299f31d0
11-
DATA ·cons<>+0x10(SB)/8, $0x13198a2e03707344
12-
DATA ·cons<>+0x18(SB)/8, $0x243f6a8885a308d3
13-
DATA ·cons<>+0x20(SB)/8, $0x3bd39e10cb0ef593
14-
DATA ·cons<>+0x28(SB)/8, $0xc0acf169b5f18a8c
15-
DATA ·cons<>+0x30(SB)/8, $0xbe5466cf34e90c6c
16-
DATA ·cons<>+0x38(SB)/8, $0x452821e638d01377
17-
GLOBL ·cons<>(SB), (NOPTR+RODATA), $64
18-
19-
DATA ·zipperMerge<>+0x00(SB)/8, $0xf010e05020c03
20-
DATA ·zipperMerge<>+0x08(SB)/8, $0x70806090d0a040b
21-
GLOBL ·zipperMerge<>(SB), (NOPTR+RODATA), $16
9+
DATA ·asmConstants<>+0x00(SB)/8, $0xdbe6d5d5fe4cce2f
10+
DATA ·asmConstants<>+0x08(SB)/8, $0xa4093822299f31d0
11+
DATA ·asmConstants<>+0x10(SB)/8, $0x13198a2e03707344
12+
DATA ·asmConstants<>+0x18(SB)/8, $0x243f6a8885a308d3
13+
DATA ·asmConstants<>+0x20(SB)/8, $0x3bd39e10cb0ef593
14+
DATA ·asmConstants<>+0x28(SB)/8, $0xc0acf169b5f18a8c
15+
DATA ·asmConstants<>+0x30(SB)/8, $0xbe5466cf34e90c6c
16+
DATA ·asmConstants<>+0x38(SB)/8, $0x452821e638d01377
17+
GLOBL ·asmConstants<>(SB), (NOPTR+RODATA), $64
18+
19+
DATA ·asmZipperMerge<>+0x00(SB)/8, $0xf010e05020c03
20+
DATA ·asmZipperMerge<>+0x08(SB)/8, $0x70806090d0a040b
21+
GLOBL ·asmZipperMerge<>(SB), (NOPTR+RODATA), $16
2222

2323
#define v00 X0
2424
#define v01 X1
@@ -104,10 +104,10 @@ GLOBL ·zipperMerge<>(SB), (NOPTR+RODATA), $16
104104
PADDQ t1, v11
105105

106106
// func initializeSSE4(state *[16]uint64, key []byte)
107-
TEXT ·initializeSSE4(SB), 4, $0-32
107+
TEXT ·initializeSSE4(SB), NOSPLIT, $0-32
108108
MOVQ state+0(FP), AX
109109
MOVQ key_base+8(FP), BX
110-
MOVQ $·cons<>(SB), CX
110+
MOVQ $·asmConstants<>(SB), CX
111111

112112
MOVOU 0(BX), v00
113113
MOVOU 16(BX), v01
@@ -136,7 +136,7 @@ TEXT ·initializeSSE4(SB), 4, $0-32
136136
RET
137137

138138
// func updateSSE4(state *[16]uint64, msg []byte)
139-
TEXT ·updateSSE4(SB), 4, $0-32
139+
TEXT ·updateSSE4(SB), NOSPLIT, $0-32
140140
MOVQ state+0(FP), AX
141141
MOVQ msg_base+8(FP), BX
142142
MOVQ msg_len+16(FP), CX
@@ -153,7 +153,7 @@ TEXT ·updateSSE4(SB), 4, $0-32
153153
MOVOU 96(AX), m10
154154
MOVOU 112(AX), m11
155155

156-
MOVOU ·zipperMerge<>(SB), t2
156+
MOVOU ·asmZipperMerge<>(SB), t2
157157

158158
LOOP:
159159
MOVOU 0(BX), t0
@@ -178,7 +178,7 @@ DONE:
178178
RET
179179

180180
// func finalizeSSE4(out []byte, state *[16]uint64)
181-
TEXT ·finalizeSSE4(SB), 4, $0-32
181+
TEXT ·finalizeSSE4(SB), NOSPLIT, $0-32
182182
MOVQ state+24(FP), AX
183183
MOVQ out_base+0(FP), BX
184184
MOVQ out_len+8(FP), CX
@@ -192,7 +192,7 @@ TEXT ·finalizeSSE4(SB), 4, $0-32
192192
MOVOU 96(AX), m10
193193
MOVOU 112(AX), m11
194194

195-
MOVOU ·zipperMerge<>(SB), t2
195+
MOVOU ·asmZipperMerge<>(SB), t2
196196

197197
PSHUFD $177, v01, t0
198198
PSHUFD $177, v00, t1

highwayhash_arm64.go

-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
// Use of this source code is governed by a license that can be
33
// found in the LICENSE file.
44

5-
//+build ignore
6-
75
//+build !noasm,!appengine
86

97
package highwayhash

highwayhash_arm64.s

+72-76
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@
1414
// limitations under the License.
1515
//
1616

17-
//+build ignore
18-
1917
//+build !noasm,!appengine
2018

2119
// Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
2220
// the opcodes of their Plan9 equivalents
2321

22+
#include "textflag.h"
23+
2424
#define REDUCE_MOD(x0, x1, x2, x3, tmp0, tmp1, y0, y1) \
2525
MOVD $0x3FFFFFFFFFFFFFFF, tmp0 \
2626
AND tmp0, x3 \
@@ -50,60 +50,59 @@
5050
EOR x1, y1 \
5151
EOR x3, y1
5252

53-
#define UPDATE(MSG1, MSG2) \
53+
#define UPDATE(MSG1, MSG2) \
5454
\ // Add message
55-
VADD MSG1.D2, V2.D2, V2.D2 \
56-
VADD MSG2.D2, V3.D2, V3.D2 \
57-
\
55+
VADD MSG1.D2, V2.D2, V2.D2 \
56+
VADD MSG2.D2, V3.D2, V3.D2 \
57+
\
5858
\ // v1 += mul0
59-
VADD V4.D2, V2.D2, V2.D2 \
60-
VADD V5.D2, V3.D2, V3.D2 \
61-
\
59+
VADD V4.D2, V2.D2, V2.D2 \
60+
VADD V5.D2, V3.D2, V3.D2 \
61+
\
6262
\ // First pair of multiplies
63-
VTBL V29.B16, [V0.B16, V1.B16], V10.B16 \
64-
VTBL V30.B16, [V2.B16, V3.B16], V11.B16 \
65-
\
66-
\ // VUMULL V10.S2, V11.S2, V12.D2 /* assembler support missing */
67-
\ // VUMULL2 V10.S4, V11.S4, V13.D2 /* assembler support missing */
68-
WORD $0x2eaac16c \ // umull v12.2d, v11.2s, v10.2s
69-
WORD $0x6eaac16d \ // umull2 v13.2d, v11.4s, v10.4s
70-
\
63+
VTBL V29.B16, [V0.B16, V1.B16], V10.B16 \
64+
VTBL V30.B16, [V2.B16, V3.B16], V11.B16 \
65+
\
66+
\ // VUMULL V10.S2, V11.S2, V12.D2 /* assembler support missing */
67+
\ // VUMULL2 V10.S4, V11.S4, V13.D2 /* assembler support missing */
68+
WORD $0x2eaac16c \ // umull v12.2d, v11.2s, v10.2s
69+
WORD $0x6eaac16d \ // umull2 v13.2d, v11.4s, v10.4s
70+
\
7171
\ // v0 += mul1
72-
VADD V6.D2, V0.D2, V0.D2 \
73-
VADD V7.D2, V1.D2, V1.D2 \
74-
\
72+
VADD V6.D2, V0.D2, V0.D2 \
73+
VADD V7.D2, V1.D2, V1.D2 \
74+
\
7575
\ // Second pair of multiplies
76-
VTBL V29.B16, [V2.B16, V3.B16], V15.B16 \
77-
VTBL V30.B16, [V0.B16, V1.B16], V14.B16 \
78-
\
76+
VTBL V29.B16, [V2.B16, V3.B16], V15.B16 \
77+
VTBL V30.B16, [V0.B16, V1.B16], V14.B16 \
78+
\
7979
\ // EOR multiplication result in
80-
VEOR V12.B16, V4.B16, V4.B16 \
81-
VEOR V13.B16, V5.B16, V5.B16 \
82-
\
80+
VEOR V12.B16, V4.B16, V4.B16 \
81+
VEOR V13.B16, V5.B16, V5.B16 \
82+
\
8383
\ // VUMULL V14.S2, V15.S2, V16.D2 /* assembler support missing */
8484
\ // VUMULL2 V14.S4, V15.S4, V17.D2 /* assembler support missing */
85-
WORD $0x2eaec1f0 \ // umull v16.2d, v15.2s, v14.2s
86-
WORD $0x6eaec1f1 \ // umull2 v17.2d, v15.4s, v14.4s
87-
\
85+
WORD $0x2eaec1f0 \ // umull v16.2d, v15.2s, v14.2s
86+
WORD $0x6eaec1f1 \ // umull2 v17.2d, v15.4s, v14.4s
87+
\
8888
\ // First pair of zipper-merges
89-
VTBL V28.B16, [V2.B16], V18.B16 \
90-
VADD V18.D2, V0.D2, V0.D2 \
91-
VTBL V28.B16, [V3.B16], V19.B16 \
92-
VADD V19.D2, V1.D2, V1.D2 \
93-
\
89+
VTBL V28.B16, [V2.B16], V18.B16 \
90+
VADD V18.D2, V0.D2, V0.D2 \
91+
VTBL V28.B16, [V3.B16], V19.B16 \
92+
VADD V19.D2, V1.D2, V1.D2 \
93+
\
9494
\ // Second pair of zipper-merges
95-
VTBL V28.B16, [V0.B16], V20.B16 \
96-
VADD V20.D2, V2.D2, V2.D2 \
97-
VTBL V28.B16, [V1.B16], V21.B16 \
98-
VADD V21.D2, V3.D2, V3.D2 \
99-
\
95+
VTBL V28.B16, [V0.B16], V20.B16 \
96+
VADD V20.D2, V2.D2, V2.D2 \
97+
VTBL V28.B16, [V1.B16], V21.B16 \
98+
VADD V21.D2, V3.D2, V3.D2 \
99+
\
100100
\ // EOR multiplication result in
101-
VEOR V16.B16, V6.B16, V6.B16 \
101+
VEOR V16.B16, V6.B16, V6.B16 \
102102
VEOR V17.B16, V7.B16, V7.B16
103103

104-
105104
// func initializeArm64(state *[16]uint64, key []byte)
106-
TEXT ·initializeArm64(SB), 7, $0
105+
TEXT ·initializeArm64(SB), NOSPLIT, $0
107106
MOVD state+0(FP), R0
108107
MOVD key_base+8(FP), R1
109108

@@ -112,7 +111,7 @@ TEXT ·initializeArm64(SB), 7, $0
112111
VREV64 V1.S4, V3.S4
113112
VREV64 V2.S4, V4.S4
114113

115-
MOVD $·constants(SB), R3
114+
MOVD $·asmConstants(SB), R3
116115
VLD1 (R3), [V5.S4, V6.S4, V7.S4, V8.S4]
117116
VEOR V5.B16, V1.B16, V1.B16
118117
VEOR V6.B16, V2.B16, V2.B16
@@ -123,8 +122,7 @@ TEXT ·initializeArm64(SB), 7, $0
123122
VST1 [V5.D2, V6.D2, V7.D2, V8.D2], (R0)
124123
RET
125124

126-
127-
TEXT ·updateArm64(SB), 7, $0
125+
TEXT ·updateArm64(SB), NOSPLIT, $0
128126
MOVD state+0(FP), R0
129127
MOVD msg_base+8(FP), R1
130128
MOVD msg_len+16(FP), R2 // length of message
@@ -142,7 +140,7 @@ TEXT ·updateArm64(SB), 7, $0
142140
// v7 = mul1.hi
143141

144142
// Load zipper merge constants table pointer
145-
MOVD $·zipperMerge(SB), R3
143+
MOVD $·asmZipperMerge(SB), R3
146144

147145
// and load zipper merge constants into v28, v29, and v30
148146
VLD1 (R3), [V28.B16, V29.B16, V30.B16]
@@ -167,15 +165,14 @@ loop:
167165
complete:
168166
RET
169167

170-
171168
// func finalizeArm64(out []byte, state *[16]uint64)
172-
TEXT ·finalizeArm64(SB), 4, $0-32
169+
TEXT ·finalizeArm64(SB), NOSPLIT, $0-32
173170
MOVD state+24(FP), R0
174171
MOVD out_base+0(FP), R1
175172
MOVD out_len+8(FP), R2
176173

177174
// Load zipper merge constants table pointer
178-
MOVD $·zipperMerge(SB), R3
175+
MOVD $·asmZipperMerge(SB), R3
179176

180177
// and load zipper merge constants into v28, v29, and v30
181178
VLD1 (R3), [V28.B16, V29.B16, V30.B16]
@@ -200,8 +197,8 @@ TEXT ·finalizeArm64(SB), 4, $0-32
200197
VREV64 V0.S4, V27.S4
201198
UPDATE(V26, V27)
202199

203-
CMP $8, R2
204-
BEQ skipUpdate // Just 4 rounds for 64-bit checksum
200+
CMP $8, R2
201+
BEQ skipUpdate // Just 4 rounds for 64-bit checksum
205202

206203
VREV64 V1.S4, V26.S4
207204
VREV64 V0.S4, V27.S4
@@ -211,8 +208,8 @@ TEXT ·finalizeArm64(SB), 4, $0-32
211208
VREV64 V0.S4, V27.S4
212209
UPDATE(V26, V27)
213210

214-
CMP $16, R2
215-
BEQ skipUpdate // 6 rounds for 128-bit checksum
211+
CMP $16, R2
212+
BEQ skipUpdate // 6 rounds for 128-bit checksum
216213

217214
VREV64 V1.S4, V26.S4
218215
VREV64 V0.S4, V27.S4
@@ -282,16 +279,16 @@ hash128:
282279
MOVD 1*8(R0), R9
283280
MOVD 6*8(R0), R10
284281
MOVD 7*8(R0), R11
285-
ADD R10, R8
286-
ADD R11, R9
282+
ADD R10, R8
283+
ADD R11, R9
287284
MOVD 8*8(R0), R10
288285
MOVD 9*8(R0), R11
289-
ADD R10, R8
290-
ADD R11, R9
286+
ADD R10, R8
287+
ADD R11, R9
291288
MOVD 14*8(R0), R10
292289
MOVD 15*8(R0), R11
293-
ADD R10, R8
294-
ADD R11, R9
290+
ADD R10, R8
291+
ADD R11, R9
295292
MOVD R8, 0(R1)
296293
MOVD R9, 8(R1)
297294
RET
@@ -307,22 +304,21 @@ hash64:
307304
MOVD R4, (R1)
308305
RET
309306

310-
311-
DATA ·constants+0x00(SB)/8, $0xdbe6d5d5fe4cce2f
312-
DATA ·constants+0x08(SB)/8, $0xa4093822299f31d0
313-
DATA ·constants+0x10(SB)/8, $0x13198a2e03707344
314-
DATA ·constants+0x18(SB)/8, $0x243f6a8885a308d3
315-
DATA ·constants+0x20(SB)/8, $0x3bd39e10cb0ef593
316-
DATA ·constants+0x28(SB)/8, $0xc0acf169b5f18a8c
317-
DATA ·constants+0x30(SB)/8, $0xbe5466cf34e90c6c
318-
DATA ·constants+0x38(SB)/8, $0x452821e638d01377
319-
GLOBL ·constants(SB), 8, $64
307+
DATA ·asmConstants+0x00(SB)/8, $0xdbe6d5d5fe4cce2f
308+
DATA ·asmConstants+0x08(SB)/8, $0xa4093822299f31d0
309+
DATA ·asmConstants+0x10(SB)/8, $0x13198a2e03707344
310+
DATA ·asmConstants+0x18(SB)/8, $0x243f6a8885a308d3
311+
DATA ·asmConstants+0x20(SB)/8, $0x3bd39e10cb0ef593
312+
DATA ·asmConstants+0x28(SB)/8, $0xc0acf169b5f18a8c
313+
DATA ·asmConstants+0x30(SB)/8, $0xbe5466cf34e90c6c
314+
DATA ·asmConstants+0x38(SB)/8, $0x452821e638d01377
315+
GLOBL ·asmConstants(SB), 8, $64
320316

321317
// Constants for TBL instructions
322-
DATA ·zipperMerge+0x0(SB)/8, $0x000f010e05020c03 // zipper merge constant
323-
DATA ·zipperMerge+0x8(SB)/8, $0x070806090d0a040b
324-
DATA ·zipperMerge+0x10(SB)/8, $0x0f0e0d0c07060504 // setup first register for multiply
325-
DATA ·zipperMerge+0x18(SB)/8, $0x1f1e1d1c17161514
326-
DATA ·zipperMerge+0x20(SB)/8, $0x0b0a090803020100 // setup second register for multiply
327-
DATA ·zipperMerge+0x28(SB)/8, $0x1b1a191813121110
328-
GLOBL ·zipperMerge(SB), 8, $48
318+
DATA ·asmZipperMerge+0x0(SB)/8, $0x000f010e05020c03 // zipper merge constant
319+
DATA ·asmZipperMerge+0x8(SB)/8, $0x070806090d0a040b
320+
DATA ·asmZipperMerge+0x10(SB)/8, $0x0f0e0d0c07060504 // setup first register for multiply
321+
DATA ·asmZipperMerge+0x18(SB)/8, $0x1f1e1d1c17161514
322+
DATA ·asmZipperMerge+0x20(SB)/8, $0x0b0a090803020100 // setup second register for multiply
323+
DATA ·asmZipperMerge+0x28(SB)/8, $0x1b1a191813121110
324+
GLOBL ·asmZipperMerge(SB), 8, $48

highwayhash_ppc64le.s

+7-7
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,8 @@ TEXT ·updatePpc64Le(SB), NOFRAME|NOSPLIT, $0-32
108108
XXPERMDI MUL1_LO, MUL1_LO, $2, MUL1_LO
109109
XXPERMDI MUL1_HI, MUL1_HI, $2, MUL1_HI
110110

111-
// Load constants table pointer
112-
MOVD $·constants(SB), CONSTANTS
111+
// Load asmConstants table pointer
112+
MOVD $·asmConstants(SB), CONSTANTS
113113
LXVD2X (CONSTANTS)(R0), ROTATE
114114
LXVD2X (CONSTANTS)(P1), MASK
115115
XXLNAND MASK, MASK, MASK
@@ -174,9 +174,9 @@ complete:
174174
RET
175175

176176
// Constants table
177-
DATA ·constants+0x0(SB)/8, $0x0000000000000020
178-
DATA ·constants+0x8(SB)/8, $0x0000000000000020
179-
DATA ·constants+0x10(SB)/8, $0x070806090d0a040b // zipper merge constant
180-
DATA ·constants+0x18(SB)/8, $0x000f010e05020c03 // zipper merge constant
177+
DATA ·asmConstants+0x0(SB)/8, $0x0000000000000020
178+
DATA ·asmConstants+0x8(SB)/8, $0x0000000000000020
179+
DATA ·asmConstants+0x10(SB)/8, $0x070806090d0a040b // zipper merge constant
180+
DATA ·asmConstants+0x18(SB)/8, $0x000f010e05020c03 // zipper merge constant
181181

182-
GLOBL ·constants(SB), 8, $32
182+
GLOBL ·asmConstants(SB), 8, $32

highwayhash_ref.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a license that can be
33
// found in the LICENSE file.
44

5-
// +build noasm !amd64,!ppc64le
5+
// +build noasm !amd64,!arm64,!ppc64le
66

77
package highwayhash
88

0 commit comments

Comments
 (0)