-
Notifications
You must be signed in to change notification settings - Fork 67
/
Copy pathhighwayhash_amd64.s
294 lines (249 loc) · 5.78 KB
/
highwayhash_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
// Copyright (c) 2017 Minio Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build amd64 !gccgo !appengine !nacl
#include "textflag.h"
// asmConstants holds eight 64-bit initialization words (64 bytes, read-only).
// initializeSSE4 XORs words 0..3 with the key, XORs words 4..7 with the
// half-swapped key, and also stores all eight words unmodified as state
// words 8..15.
DATA ·asmConstants<>+0(SB)/8, $0xdbe6d5d5fe4cce2f
DATA ·asmConstants<>+8(SB)/8, $0xa4093822299f31d0
DATA ·asmConstants<>+16(SB)/8, $0x13198a2e03707344
DATA ·asmConstants<>+24(SB)/8, $0x243f6a8885a308d3
DATA ·asmConstants<>+32(SB)/8, $0x3bd39e10cb0ef593
DATA ·asmConstants<>+40(SB)/8, $0xc0acf169b5f18a8c
DATA ·asmConstants<>+48(SB)/8, $0xbe5466cf34e90c6c
DATA ·asmConstants<>+56(SB)/8, $0x452821e638d01377
GLOBL ·asmConstants<>(SB), (NOPTR+RODATA), $64
// asmZipperMerge is the 16-byte PSHUFB control mask used by UPDATE (it is
// loaded into t2 by updateSSE4 and finalizeSSE4). Destination byte i takes
// source byte mask[i]; in memory order the mask bytes are
//   03 0c 02 05 0e 01 0f 00 | 0b 04 0a 0d 09 06 08 07
DATA ·asmZipperMerge<>+0(SB)/8, $0x000f010e05020c03
DATA ·asmZipperMerge<>+8(SB)/8, $0x070806090d0a040b
GLOBL ·asmZipperMerge<>(SB), (NOPTR+RODATA), $16
// Register aliases shared by every routine in this file.
//
// The 128-byte hash state is kept in eight XMM registers, 16 bytes each,
// in the same order in which updateSSE4/finalizeSSE4 load it from memory:
//   v00, v01 = state bytes   0..31
//   v10, v11 = state bytes  32..63
//   m00, m01 = state bytes  64..95
//   m10, m11 = state bytes  96..127
#define v00 X0
#define v01 X1
#define v10 X2
#define v11 X3
#define m00 X4
#define m01 X5
#define m10 X6
#define m11 X7
// t0 and t1 are scratch registers: they carry the message block into UPDATE
// and are overwritten by it.
#define t0 X8
#define t1 X9
// t2 holds the asmZipperMerge PSHUFB mask while UPDATE is in use.
#define t2 X10
// REDUCE_MOD folds a 256-bit value down to 128 bits using general-purpose
// registers.
//
// Inputs:  x1:x0 = low 128 bits, x3:x2 = high 128 bits.
// Outputs: y1:y0 = (x1:x0) ^ (H << 1) ^ (H << 2), where
//          H = (x3 & 0x3FFFFFFFFFFFFFFF):x2 and both shifts are truncated
//          to 128 bits.
// Clobbers: x2, x3, tmp0, tmp1 (x0 and x1 are only read).
//
// No comments are placed inside the macro body because a // comment would
// swallow the line-continuation backslash.
#define REDUCE_MOD(x0, x1, x2, x3, tmp0, tmp1, y0, y1) \
	MOVQ $0x3FFFFFFFFFFFFFFF, tmp0 \
	ANDQ tmp0, x3 \
	MOVQ x3, y1 \
	MOVQ x2, y0 \
	\
	MOVQ x3, tmp1 \
	MOVQ x2, tmp0 \
	SHRQ $63, tmp0 \
	SHLQ $1, tmp1 \
	MOVQ tmp1, x3 \
	ORQ tmp0, x3 \
	SHLQ $1, x2 \
	\
	MOVQ y1, tmp1 \
	MOVQ y0, tmp0 \
	SHRQ $62, tmp0 \
	SHLQ $2, tmp1 \
	MOVQ tmp1, y1 \
	ORQ tmp0, y1 \
	SHLQ $2, y0 \
	\
	XORQ x2, y0 \
	XORQ x0, y0 \
	XORQ x3, y1 \
	XORQ x1, y1
// UPDATE mixes one 32-byte block (msg0 = low 16 bytes, msg1 = high 16 bytes)
// into the state held in v00..v11 / m00..m11. Per 64-bit lane it performs:
//
//   v1 += m0 + msg
//   m0 ^= (v0 >> 32) * uint32(v1)
//   v0 += m1
//   m1 ^= (v1 >> 32) * uint32(v0)
//   v0 += PSHUFB(v1, t2)
//   v1 += PSHUFB(v0, t2)
//
// where v0 = v00/v01, v1 = v10/v11, m0 = m00/m01 and m1 = m10/m11.
//
// Requirements: t2 must already contain the asmZipperMerge mask.
// Clobbers: t0, t1. msg0/msg1 may themselves be t0/t1, because both are
// consumed before t0/t1 are first overwritten.
//
// No comments are placed inside the macro body because a // comment would
// swallow the line-continuation backslash.
#define UPDATE(msg0, msg1) \
	PADDQ m00, v10 \
	PADDQ m01, v11 \
	PADDQ msg0, v10 \
	PADDQ msg1, v11 \
	\
	MOVO v00, t0 \
	PSRLQ $32, t0 \
	PMULULQ v10, t0 \
	PXOR t0, m00 \
	MOVO v01, t1 \
	PSRLQ $32, t1 \
	PMULULQ v11, t1 \
	PXOR t1, m01 \
	\
	PADDQ m10, v00 \
	PADDQ m11, v01 \
	\
	MOVO v10, t0 \
	PSRLQ $32, t0 \
	PMULULQ v00, t0 \
	PXOR t0, m10 \
	MOVO v11, t1 \
	PSRLQ $32, t1 \
	PMULULQ v01, t1 \
	PXOR t1, m11 \
	\
	MOVO v10, t0 \
	MOVO v11, t1 \
	PSHUFB t2, t0 \
	PSHUFB t2, t1 \
	PADDQ t0, v00 \
	PADDQ t1, v01 \
	\
	MOVO v00, t0 \
	MOVO v01, t1 \
	PSHUFB t2, t0 \
	PSHUFB t2, t1 \
	PADDQ t0, v10 \
	PADDQ t1, v11
// func initializeSSE4(state *[16]uint64, key []byte)
//
// Fills the 128-byte state from the first 32 bytes of key:
//   state bytes   0..31  = key ^ asmConstants[0..31]
//   state bytes  32..63  = swap32(key) ^ asmConstants[32..63]
//   state bytes  64..127 = asmConstants[0..63]
// where swap32 exchanges the two 32-bit halves of every 64-bit word.
// The key length is not checked here; 32 bytes are read unconditionally.
TEXT ·initializeSSE4(SB), NOSPLIT, $0-32
	MOVQ key_base+8(FP), BX
	MOVQ state+0(FP), AX

	// Constants go straight into the m registers.
	MOVOU ·asmConstants<>+0(SB), m00
	MOVOU ·asmConstants<>+16(SB), m01
	MOVOU ·asmConstants<>+32(SB), m10
	MOVOU ·asmConstants<>+48(SB), m11

	// v0 = key, v1 = key with the 32-bit halves of each 64-bit word swapped
	// (shuffle control 0xB1 selects dwords 1,0,3,2).
	MOVOU  0(BX), v00
	PSHUFD $0xB1, v00, v10
	MOVOU  16(BX), v01
	PSHUFD $0xB1, v01, v11

	PXOR m00, v00
	PXOR m10, v10
	PXOR m01, v01
	PXOR m11, v11

	MOVOU m00, 64(AX)
	MOVOU m01, 80(AX)
	MOVOU m10, 96(AX)
	MOVOU m11, 112(AX)
	MOVOU v00, 0(AX)
	MOVOU v01, 16(AX)
	MOVOU v10, 32(AX)
	MOVOU v11, 48(AX)
	RET
// func updateSSE4(state *[16]uint64, msg []byte)
//
// Absorbs msg into state, one 32-byte block at a time.
//
//   AX = state pointer
//   BX = pointer to the next unread block of msg
//   CX = number of unread bytes in msg
//
// Only whole 32-byte blocks are consumed. If len(msg) < 32 the state is left
// untouched; any trailing len(msg) % 32 bytes are ignored and never read.
TEXT ·updateSSE4(SB), NOSPLIT, $0-32
	MOVQ state+0(FP), AX
	MOVQ msg_base+8(FP), BX
	MOVQ msg_len+16(FP), CX

	CMPQ CX, $32
	JB   DONE

	MOVOU 0(AX), v00
	MOVOU 16(AX), v01
	MOVOU 32(AX), v10
	MOVOU 48(AX), v11
	MOVOU 64(AX), m00
	MOVOU 80(AX), m01
	MOVOU 96(AX), m10
	MOVOU 112(AX), m11

	// UPDATE expects the byte-shuffle mask in t2.
	MOVOU ·asmZipperMerge<>(SB), t2

LOOP:
	// Invariant: CX >= 32, so the 32 bytes at BX are inside msg.
	MOVOU 0(BX), t0
	MOVOU 16(BX), t1

	UPDATE(t0, t1)

	ADDQ $32, BX
	SUBQ $32, CX

	// Continue only while a full block remains. Testing just "CX != 0"
	// here would run one more iteration on a partial tail and read past
	// the end of msg.
	CMPQ CX, $32
	JAE  LOOP

	MOVOU v00, 0(AX)
	MOVOU v01, 16(AX)
	MOVOU v10, 32(AX)
	MOVOU v11, 48(AX)
	MOVOU m00, 64(AX)
	MOVOU m01, 80(AX)
	MOVOU m10, 96(AX)
	MOVOU m11, 112(AX)

DONE:
	RET
// FINAL_ROUND runs one finalization round: it builds a block from the
// current v0 (upper and lower 16 bytes exchanged, and the 32-bit halves of
// every 64-bit word swapped) and feeds it to UPDATE.
// Requires t2 = asmZipperMerge mask; clobbers t0 and t1.
#define FINAL_ROUND \
	PSHUFD $0xB1, v01, t0 \
	PSHUFD $0xB1, v00, t1 \
	UPDATE(t0, t1)

// func finalizeSSE4(out []byte, state *[16]uint64)
//
// Runs the finalization rounds on state and writes the checksum to out.
// len(out) selects the variant:
//    8 bytes ->  4 rounds, 64-bit checksum
//   16 bytes ->  6 rounds, 128-bit checksum
//   any other length -> 10 rounds, 32 bytes written (256-bit checksum)
// The state as it stands after the rounds is written back through the state
// pointer before the checksum is computed.
//
//   AX = state pointer, BX = out pointer, CX = len(out)
TEXT ·finalizeSSE4(SB), NOSPLIT, $0-32
	MOVQ out_base+0(FP), BX
	MOVQ out_len+8(FP), CX
	MOVQ state+24(FP), AX

	MOVOU ·asmZipperMerge<>(SB), t2

	MOVOU 0(AX), v00
	MOVOU 16(AX), v01
	MOVOU 32(AX), v10
	MOVOU 48(AX), v11
	MOVOU 64(AX), m00
	MOVOU 80(AX), m01
	MOVOU 96(AX), m10
	MOVOU 112(AX), m11

	// Rounds 1-4: performed for every output size.
	FINAL_ROUND
	FINAL_ROUND
	FINAL_ROUND
	FINAL_ROUND

	CMPQ CX, $8
	JE   roundsDone // Just 4 rounds for 64-bit checksum

	// Rounds 5-6.
	FINAL_ROUND
	FINAL_ROUND

	CMPQ CX, $16
	JE   roundsDone // 6 rounds for 128-bit checksum

	// Rounds 7-10.
	FINAL_ROUND
	FINAL_ROUND
	FINAL_ROUND
	FINAL_ROUND

roundsDone:
	// Persist the post-round state.
	MOVOU v00, 0(AX)
	MOVOU v01, 16(AX)
	MOVOU v10, 32(AX)
	MOVOU v11, 48(AX)
	MOVOU m00, 64(AX)
	MOVOU m01, 80(AX)
	MOVOU m10, 96(AX)
	MOVOU m11, 112(AX)

	CMPQ CX, $8
	JE   sum64
	CMPQ CX, $16
	JE   sum128

	// 256-bit checksum: each 16-byte half of the output is
	// REDUCE_MOD(low = v0 + m0, high = v1 + m1) for that half.
	PADDQ v00, m00
	PADDQ v01, m01
	PADDQ v10, m10
	PADDQ v11, m11

	MOVQ   m00, R8
	PEXTRQ $1, m00, R9
	MOVQ   m10, R10
	PEXTRQ $1, m10, R11
	REDUCE_MOD(R8, R9, R10, R11, R12, R13, R14, R15)
	MOVQ R14, 0(BX)
	MOVQ R15, 8(BX)

	MOVQ   m01, R8
	PEXTRQ $1, m01, R9
	MOVQ   m11, R10
	PEXTRQ $1, m11, R11
	REDUCE_MOD(R8, R9, R10, R11, R12, R13, R14, R15)
	MOVQ R14, 16(BX)
	MOVQ R15, 24(BX)
	RET

sum128:
	// out[0:16] = v00 + v11 + m00 + m11 (per 64-bit lane)
	PADDQ m00, m11
	PADDQ v00, v11
	PADDQ v11, m11
	MOVOU m11, 0(BX)
	RET

sum64:
	// out[0:8] = low 64 bits of v00 + v10 + m00 + m10
	PADDQ m00, m10
	PADDQ v00, v10
	PADDQ v10, m10
	MOVQ  m10, DX
	MOVQ  DX, 0(BX)
	RET