/Users/brunogarcia/projects/bitcoin-core-dev/src/crypto/sha256_arm_shani.cpp
Line | Count | Source |
1 | | // Copyright (c) 2022 The Bitcoin Core developers |
2 | | // Distributed under the MIT software license, see the accompanying |
3 | | // file COPYING or http://www.opensource.org/licenses/mit-license.php. |
4 | | // |
5 | | // Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c, |
6 | | // Written and placed in public domain by Jeffrey Walton. |
7 | | // Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and |
8 | | // Barry O'Rourke for the mbedTLS project. |
9 | | // Variant specialized for 64-byte inputs added by Pieter Wuille. |
10 | | |
11 | | #ifdef ENABLE_ARM_SHANI |
12 | | |
13 | | #include <array> |
14 | | #include <cstdint> |
15 | | #include <cstddef> |
16 | | #include <arm_acle.h> |
17 | | #include <arm_neon.h> |
18 | | |
namespace {
/** SHA-256 round constants K_t (FIPS 180-4, section 4.2.2): the first 32 bits
 *  of the fractional parts of the cube roots of the first 64 prime numbers.
 *
 *  Aligned to the NEON vector type because the transform below loads them
 *  four at a time with vld1q_u32 (one 128-bit load per quad-round).
 */
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
{
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
}
40 | | |
namespace sha256_arm_shani {
/** SHA-256 compression using the ARMv8 Cryptography Extensions (SHA2).
 *
 *  @param s      Pointer to the eight 32-bit state words (a..h); updated in place.
 *  @param chunk  Pointer to `blocks` consecutive 64-byte message blocks.
 *  @param blocks Number of 64-byte blocks to process.
 *
 *  Register layout: STATE0 holds words {a,b,e,f} ("ABEF") and STATE1 holds
 *  {c,d,g,h} ("CDGH"), the layout required by the vsha256h/vsha256h2
 *  instructions. Each "Rounds N-N+3" section performs four SHA-256 rounds:
 *  vsha256hq_u32 / vsha256h2q_u32 compute the two state halves, while
 *  vsha256su0q_u32 / vsha256su1q_u32 extend the message schedule for a
 *  later quad-round. TMP2 must snapshot STATE0 *before* vsha256hq updates
 *  it, because vsha256h2q needs the pre-round ABEF value.
 */
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP2;

    // Load state
    STATE0 = vld1q_u32(&s[0]);
    STATE1 = vld1q_u32(&s[4]);

    while (blocks--)
    {
        // Save state (added back after the 64 rounds, per the SHA-256 spec)
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Load and convert input chunk to Big Endian
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
        chunk += 64;

        // Original implementation preloaded message and constant addition which was 1-3% slower.
        // Now included as first step in quad round code saving one Q Neon register
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"

        // Rounds 1-4
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 5-8
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 9-12
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 13-16
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 17-20
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 21-24
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 25-28
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 29-32
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 33-36
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 37-40
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 41-44
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 45-48
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 49-64 need no further message-schedule extension: the
        // schedule words W[48..63] are already in MSG0..MSG3.

        // Rounds 49-52
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 53-56
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 57-60
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 61-64
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Update state: add this block's result to the saved input state
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
    }

    // Save final state
    vst1q_u32(&s[0], STATE0);
    vst1q_u32(&s[4], STATE1);
}
}
199 | | |
200 | | namespace sha256d64_arm_shani { |
201 | | void Transform_2way(unsigned char* output, const unsigned char* input) |
202 | 0 | { |
203 | | /* Initial state. */ |
204 | 0 | alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = { |
205 | 0 | 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, |
206 | 0 | 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 |
207 | 0 | }; |
208 | | |
209 | | /* Precomputed message schedule for the 2nd transform. */ |
210 | 0 | alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = { |
211 | 0 | 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
212 | 0 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
213 | 0 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
214 | 0 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374, |
215 | 0 | 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254, |
216 | 0 | 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa, |
217 | 0 | 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7, |
218 | 0 | 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0, |
219 | 0 | 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd, |
220 | 0 | 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16, |
221 | 0 | 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537, |
222 | 0 | 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37, |
223 | 0 | 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7, |
224 | 0 | 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890, |
225 | 0 | 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c, |
226 | 0 | 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 |
227 | 0 | }; |
228 | | |
229 | | /* A few precomputed message schedule values for the 3rd transform. */ |
230 | 0 | alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = { |
231 | 0 | 0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
232 | 0 | 0x80000000, 0x00000000, 0x00000000, 0x00000000, |
233 | 0 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274 |
234 | 0 | }; |
235 | | |
236 | | /* Padding processed in the 3rd transform (byteswapped). */ |
237 | 0 | alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100}; |
238 | |
|
239 | 0 | uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB; |
240 | 0 | uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B; |
241 | 0 | uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP; |
242 | | |
243 | | // Transform 1: Load state |
244 | 0 | STATE0A = vld1q_u32(&INIT[0]); |
245 | 0 | STATE0B = STATE0A; |
246 | 0 | STATE1A = vld1q_u32(&INIT[4]); |
247 | 0 | STATE1B = STATE1A; |
248 | | |
249 | | // Transform 1: Load and convert input chunk to Big Endian |
250 | 0 | MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0))); |
251 | 0 | MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16))); |
252 | 0 | MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32))); |
253 | 0 | MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48))); |
254 | 0 | MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64))); |
255 | 0 | MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80))); |
256 | 0 | MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96))); |
257 | 0 | MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112))); |
258 | | |
259 | | // Transform 1: Rounds 1-4 |
260 | 0 | TMP = vld1q_u32(&K[0]); |
261 | 0 | TMP0A = vaddq_u32(MSG0A, TMP); |
262 | 0 | TMP0B = vaddq_u32(MSG0B, TMP); |
263 | 0 | TMP2A = STATE0A; |
264 | 0 | TMP2B = STATE0B; |
265 | 0 | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
266 | 0 | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
267 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
268 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
269 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
270 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
271 | 0 | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
272 | 0 | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
273 | | |
274 | | // Transform 1: Rounds 5-8 |
275 | 0 | TMP = vld1q_u32(&K[4]); |
276 | 0 | TMP0A = vaddq_u32(MSG1A, TMP); |
277 | 0 | TMP0B = vaddq_u32(MSG1B, TMP); |
278 | 0 | TMP2A = STATE0A; |
279 | 0 | TMP2B = STATE0B; |
280 | 0 | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
281 | 0 | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
282 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
283 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
284 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
285 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
286 | 0 | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
287 | 0 | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
288 | | |
289 | | // Transform 1: Rounds 9-12 |
290 | 0 | TMP = vld1q_u32(&K[8]); |
291 | 0 | TMP0A = vaddq_u32(MSG2A, TMP); |
292 | 0 | TMP0B = vaddq_u32(MSG2B, TMP); |
293 | 0 | TMP2A = STATE0A; |
294 | 0 | TMP2B = STATE0B; |
295 | 0 | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
296 | 0 | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
297 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
298 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
299 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
300 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
301 | 0 | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
302 | 0 | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
303 | | |
304 | | // Transform 1: Rounds 13-16 |
305 | 0 | TMP = vld1q_u32(&K[12]); |
306 | 0 | TMP0A = vaddq_u32(MSG3A, TMP); |
307 | 0 | TMP0B = vaddq_u32(MSG3B, TMP); |
308 | 0 | TMP2A = STATE0A; |
309 | 0 | TMP2B = STATE0B; |
310 | 0 | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
311 | 0 | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
312 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
313 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
314 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
315 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
316 | 0 | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
317 | 0 | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
318 | | |
319 | | // Transform 1: Rounds 17-20 |
320 | 0 | TMP = vld1q_u32(&K[16]); |
321 | 0 | TMP0A = vaddq_u32(MSG0A, TMP); |
322 | 0 | TMP0B = vaddq_u32(MSG0B, TMP); |
323 | 0 | TMP2A = STATE0A; |
324 | 0 | TMP2B = STATE0B; |
325 | 0 | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
326 | 0 | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
327 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
328 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
329 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
330 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
331 | 0 | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
332 | 0 | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
333 | | |
334 | | // Transform 1: Rounds 21-24 |
335 | 0 | TMP = vld1q_u32(&K[20]); |
336 | 0 | TMP0A = vaddq_u32(MSG1A, TMP); |
337 | 0 | TMP0B = vaddq_u32(MSG1B, TMP); |
338 | 0 | TMP2A = STATE0A; |
339 | 0 | TMP2B = STATE0B; |
340 | 0 | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
341 | 0 | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
342 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
343 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
344 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
345 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
346 | 0 | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
347 | 0 | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
348 | | |
349 | | // Transform 1: Rounds 25-28 |
350 | 0 | TMP = vld1q_u32(&K[24]); |
351 | 0 | TMP0A = vaddq_u32(MSG2A, TMP); |
352 | 0 | TMP0B = vaddq_u32(MSG2B, TMP); |
353 | 0 | TMP2A = STATE0A; |
354 | 0 | TMP2B = STATE0B; |
355 | 0 | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
356 | 0 | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
357 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
358 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
359 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
360 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
361 | 0 | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
362 | 0 | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
363 | | |
364 | | // Transform 1: Rounds 29-32 |
365 | 0 | TMP = vld1q_u32(&K[28]); |
366 | 0 | TMP0A = vaddq_u32(MSG3A, TMP); |
367 | 0 | TMP0B = vaddq_u32(MSG3B, TMP); |
368 | 0 | TMP2A = STATE0A; |
369 | 0 | TMP2B = STATE0B; |
370 | 0 | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
371 | 0 | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
372 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
373 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
374 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
375 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
376 | 0 | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
377 | 0 | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
378 | | |
379 | | // Transform 1: Rounds 33-36 |
380 | 0 | TMP = vld1q_u32(&K[32]); |
381 | 0 | TMP0A = vaddq_u32(MSG0A, TMP); |
382 | 0 | TMP0B = vaddq_u32(MSG0B, TMP); |
383 | 0 | TMP2A = STATE0A; |
384 | 0 | TMP2B = STATE0B; |
385 | 0 | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
386 | 0 | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
387 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
388 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
389 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
390 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
391 | 0 | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
392 | 0 | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
393 | | |
394 | | // Transform 1: Rounds 37-40 |
395 | 0 | TMP = vld1q_u32(&K[36]); |
396 | 0 | TMP0A = vaddq_u32(MSG1A, TMP); |
397 | 0 | TMP0B = vaddq_u32(MSG1B, TMP); |
398 | 0 | TMP2A = STATE0A; |
399 | 0 | TMP2B = STATE0B; |
400 | 0 | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
401 | 0 | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
402 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
403 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
404 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
405 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
406 | 0 | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
407 | 0 | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
408 | | |
409 | | // Transform 1: Rounds 41-44 |
410 | 0 | TMP = vld1q_u32(&K[40]); |
411 | 0 | TMP0A = vaddq_u32(MSG2A, TMP); |
412 | 0 | TMP0B = vaddq_u32(MSG2B, TMP); |
413 | 0 | TMP2A = STATE0A; |
414 | 0 | TMP2B = STATE0B; |
415 | 0 | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
416 | 0 | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
417 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
418 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
419 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
420 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
421 | 0 | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
422 | 0 | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
423 | | |
424 | | // Transform 1: Rounds 45-48 |
425 | 0 | TMP = vld1q_u32(&K[44]); |
426 | 0 | TMP0A = vaddq_u32(MSG3A, TMP); |
427 | 0 | TMP0B = vaddq_u32(MSG3B, TMP); |
428 | 0 | TMP2A = STATE0A; |
429 | 0 | TMP2B = STATE0B; |
430 | 0 | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
431 | 0 | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
432 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
433 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
434 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
435 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
436 | 0 | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
437 | 0 | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
438 | | |
439 | | // Transform 1: Rounds 49-52 |
440 | 0 | TMP = vld1q_u32(&K[48]); |
441 | 0 | TMP0A = vaddq_u32(MSG0A, TMP); |
442 | 0 | TMP0B = vaddq_u32(MSG0B, TMP); |
443 | 0 | TMP2A = STATE0A; |
444 | 0 | TMP2B = STATE0B; |
445 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
446 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
447 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
448 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
449 | | |
450 | | // Transform 1: Rounds 53-56 |
451 | 0 | TMP = vld1q_u32(&K[52]); |
452 | 0 | TMP0A = vaddq_u32(MSG1A, TMP); |
453 | 0 | TMP0B = vaddq_u32(MSG1B, TMP); |
454 | 0 | TMP2A = STATE0A; |
455 | 0 | TMP2B = STATE0B; |
456 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
457 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
458 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
459 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
460 | | |
461 | | // Transform 1: Rounds 57-60 |
462 | 0 | TMP = vld1q_u32(&K[56]); |
463 | 0 | TMP0A = vaddq_u32(MSG2A, TMP); |
464 | 0 | TMP0B = vaddq_u32(MSG2B, TMP); |
465 | 0 | TMP2A = STATE0A; |
466 | 0 | TMP2B = STATE0B; |
467 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
468 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
469 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
470 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
471 | | |
472 | | // Transform 1: Rounds 61-64 |
473 | 0 | TMP = vld1q_u32(&K[60]); |
474 | 0 | TMP0A = vaddq_u32(MSG3A, TMP); |
475 | 0 | TMP0B = vaddq_u32(MSG3B, TMP); |
476 | 0 | TMP2A = STATE0A; |
477 | 0 | TMP2B = STATE0B; |
478 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
479 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
480 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
481 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
482 | | |
483 | | // Transform 1: Update state |
484 | 0 | TMP = vld1q_u32(&INIT[0]); |
485 | 0 | STATE0A = vaddq_u32(STATE0A, TMP); |
486 | 0 | STATE0B = vaddq_u32(STATE0B, TMP); |
487 | 0 | TMP = vld1q_u32(&INIT[4]); |
488 | 0 | STATE1A = vaddq_u32(STATE1A, TMP); |
489 | 0 | STATE1B = vaddq_u32(STATE1B, TMP); |
490 | | |
491 | | // Transform 2: Save state |
492 | 0 | ABEF_SAVEA = STATE0A; |
493 | 0 | ABEF_SAVEB = STATE0B; |
494 | 0 | CDGH_SAVEA = STATE1A; |
495 | 0 | CDGH_SAVEB = STATE1B; |
496 | | |
497 | | // Transform 2: Rounds 1-4 |
498 | 0 | TMP = vld1q_u32(&MIDS[0]); |
499 | 0 | TMP2A = STATE0A; |
500 | 0 | TMP2B = STATE0B; |
501 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
502 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
503 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
504 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
505 | | |
506 | | // Transform 2: Rounds 5-8 |
507 | 0 | TMP = vld1q_u32(&MIDS[4]); |
508 | 0 | TMP2A = STATE0A; |
509 | 0 | TMP2B = STATE0B; |
510 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
511 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
512 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
513 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
514 | | |
515 | | // Transform 2: Rounds 9-12 |
516 | 0 | TMP = vld1q_u32(&MIDS[8]); |
517 | 0 | TMP2A = STATE0A; |
518 | 0 | TMP2B = STATE0B; |
519 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
520 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
521 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
522 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
523 | | |
524 | | // Transform 2: Rounds 13-16 |
525 | 0 | TMP = vld1q_u32(&MIDS[12]); |
526 | 0 | TMP2A = STATE0A; |
527 | 0 | TMP2B = STATE0B; |
528 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
529 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
530 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
531 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
532 | | |
533 | | // Transform 2: Rounds 17-20 |
534 | 0 | TMP = vld1q_u32(&MIDS[16]); |
535 | 0 | TMP2A = STATE0A; |
536 | 0 | TMP2B = STATE0B; |
537 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
538 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
539 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
540 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
541 | | |
542 | | // Transform 2: Rounds 21-24 |
543 | 0 | TMP = vld1q_u32(&MIDS[20]); |
544 | 0 | TMP2A = STATE0A; |
545 | 0 | TMP2B = STATE0B; |
546 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
547 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
548 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
549 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
550 | | |
551 | | // Transform 2: Rounds 25-28 |
552 | 0 | TMP = vld1q_u32(&MIDS[24]); |
553 | 0 | TMP2A = STATE0A; |
554 | 0 | TMP2B = STATE0B; |
555 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
556 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
557 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
558 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
559 | | |
560 | | // Transform 2: Rounds 29-32 |
561 | 0 | TMP = vld1q_u32(&MIDS[28]); |
562 | 0 | TMP2A = STATE0A; |
563 | 0 | TMP2B = STATE0B; |
564 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
565 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
566 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
567 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
568 | | |
569 | | // Transform 2: Rounds 33-36 |
570 | 0 | TMP = vld1q_u32(&MIDS[32]); |
571 | 0 | TMP2A = STATE0A; |
572 | 0 | TMP2B = STATE0B; |
573 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
574 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
575 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
576 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
577 | | |
578 | | // Transform 2: Rounds 37-40 |
579 | 0 | TMP = vld1q_u32(&MIDS[36]); |
580 | 0 | TMP2A = STATE0A; |
581 | 0 | TMP2B = STATE0B; |
582 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
583 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
584 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
585 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
586 | | |
587 | | // Transform 2: Rounds 41-44 |
588 | 0 | TMP = vld1q_u32(&MIDS[40]); |
589 | 0 | TMP2A = STATE0A; |
590 | 0 | TMP2B = STATE0B; |
591 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
592 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
593 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
594 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
595 | | |
596 | | // Transform 2: Rounds 45-48 |
597 | 0 | TMP = vld1q_u32(&MIDS[44]); |
598 | 0 | TMP2A = STATE0A; |
599 | 0 | TMP2B = STATE0B; |
600 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
601 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
602 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
603 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
604 | | |
605 | | // Transform 2: Rounds 49-52 |
606 | 0 | TMP = vld1q_u32(&MIDS[48]); |
607 | 0 | TMP2A = STATE0A; |
608 | 0 | TMP2B = STATE0B; |
609 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
610 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
611 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
612 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
613 | | |
614 | | // Transform 2: Rounds 53-56 |
615 | 0 | TMP = vld1q_u32(&MIDS[52]); |
616 | 0 | TMP2A = STATE0A; |
617 | 0 | TMP2B = STATE0B; |
618 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
619 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
620 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
621 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
622 | | |
623 | | // Transform 2: Rounds 57-60 |
624 | 0 | TMP = vld1q_u32(&MIDS[56]); |
625 | 0 | TMP2A = STATE0A; |
626 | 0 | TMP2B = STATE0B; |
627 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
628 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
629 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
630 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
631 | | |
632 | | // Transform 2: Rounds 61-64 |
633 | 0 | TMP = vld1q_u32(&MIDS[60]); |
634 | 0 | TMP2A = STATE0A; |
635 | 0 | TMP2B = STATE0B; |
636 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
637 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
638 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
639 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
640 | | |
641 | | // Transform 2: Update state |
642 | 0 | STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA); |
643 | 0 | STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB); |
644 | 0 | STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA); |
645 | 0 | STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB); |
646 | | |
647 | | // Transform 3: Pad previous output |
648 | 0 | MSG0A = STATE0A; |
649 | 0 | MSG0B = STATE0B; |
650 | 0 | MSG1A = STATE1A; |
651 | 0 | MSG1B = STATE1B; |
652 | 0 | MSG2A = vld1q_u32(&FINAL[0]); |
653 | 0 | MSG2B = MSG2A; |
654 | 0 | MSG3A = vld1q_u32(&FINAL[4]); |
655 | 0 | MSG3B = MSG3A; |
656 | | |
657 | | // Transform 3: Load state |
658 | 0 | STATE0A = vld1q_u32(&INIT[0]); |
659 | 0 | STATE0B = STATE0A; |
660 | 0 | STATE1A = vld1q_u32(&INIT[4]); |
661 | 0 | STATE1B = STATE1A; |
662 | | |
663 | | // Transform 3: Rounds 1-4 |
664 | 0 | TMP = vld1q_u32(&K[0]); |
665 | 0 | TMP0A = vaddq_u32(MSG0A, TMP); |
666 | 0 | TMP0B = vaddq_u32(MSG0B, TMP); |
667 | 0 | TMP2A = STATE0A; |
668 | 0 | TMP2B = STATE0B; |
669 | 0 | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
670 | 0 | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
671 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
672 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
673 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
674 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
675 | 0 | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
676 | 0 | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
677 | | |
678 | | // Transform 3: Rounds 5-8 |
679 | 0 | TMP = vld1q_u32(&K[4]); |
680 | 0 | TMP0A = vaddq_u32(MSG1A, TMP); |
681 | 0 | TMP0B = vaddq_u32(MSG1B, TMP); |
682 | 0 | TMP2A = STATE0A; |
683 | 0 | TMP2B = STATE0B; |
684 | 0 | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
685 | 0 | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
686 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
687 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
688 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
689 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
690 | 0 | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
691 | 0 | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
692 | | |
693 | | // Transform 3: Rounds 9-12 |
694 | 0 | TMP = vld1q_u32(&FINS[0]); |
695 | 0 | TMP2A = STATE0A; |
696 | 0 | TMP2B = STATE0B; |
697 | 0 | MSG2A = vld1q_u32(&FINS[4]); |
698 | 0 | MSG2B = MSG2A; |
699 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
700 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
701 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
702 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
703 | 0 | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
704 | 0 | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
705 | | |
706 | | // Transform 3: Rounds 13-16 |
707 | 0 | TMP = vld1q_u32(&FINS[8]); |
708 | 0 | TMP2A = STATE0A; |
709 | 0 | TMP2B = STATE0B; |
710 | 0 | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
711 | 0 | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
712 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
713 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
714 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
715 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
716 | 0 | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
717 | 0 | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
718 | | |
719 | | // Transform 3: Rounds 17-20 |
720 | 0 | TMP = vld1q_u32(&K[16]); |
721 | 0 | TMP0A = vaddq_u32(MSG0A, TMP); |
722 | 0 | TMP0B = vaddq_u32(MSG0B, TMP); |
723 | 0 | TMP2A = STATE0A; |
724 | 0 | TMP2B = STATE0B; |
725 | 0 | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
726 | 0 | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
727 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
728 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
729 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
730 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
731 | 0 | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
732 | 0 | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
733 | | |
734 | | // Transform 3: Rounds 21-24 |
735 | 0 | TMP = vld1q_u32(&K[20]); |
736 | 0 | TMP0A = vaddq_u32(MSG1A, TMP); |
737 | 0 | TMP0B = vaddq_u32(MSG1B, TMP); |
738 | 0 | TMP2A = STATE0A; |
739 | 0 | TMP2B = STATE0B; |
740 | 0 | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
741 | 0 | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
742 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
743 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
744 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
745 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
746 | 0 | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
747 | 0 | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
748 | | |
749 | | // Transform 3: Rounds 25-28 |
750 | 0 | TMP = vld1q_u32(&K[24]); |
751 | 0 | TMP0A = vaddq_u32(MSG2A, TMP); |
752 | 0 | TMP0B = vaddq_u32(MSG2B, TMP); |
753 | 0 | TMP2A = STATE0A; |
754 | 0 | TMP2B = STATE0B; |
755 | 0 | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
756 | 0 | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
757 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
758 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
759 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
760 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
761 | 0 | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
762 | 0 | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
763 | | |
764 | | // Transform 3: Rounds 29-32 |
765 | 0 | TMP = vld1q_u32(&K[28]); |
766 | 0 | TMP0A = vaddq_u32(MSG3A, TMP); |
767 | 0 | TMP0B = vaddq_u32(MSG3B, TMP); |
768 | 0 | TMP2A = STATE0A; |
769 | 0 | TMP2B = STATE0B; |
770 | 0 | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
771 | 0 | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
772 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
773 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
774 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
775 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
776 | 0 | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
777 | 0 | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
778 | | |
779 | | // Transform 3: Rounds 33-36 |
780 | 0 | TMP = vld1q_u32(&K[32]); |
781 | 0 | TMP0A = vaddq_u32(MSG0A, TMP); |
782 | 0 | TMP0B = vaddq_u32(MSG0B, TMP); |
783 | 0 | TMP2A = STATE0A; |
784 | 0 | TMP2B = STATE0B; |
785 | 0 | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
786 | 0 | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
787 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
788 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
789 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
790 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
791 | 0 | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
792 | 0 | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
793 | | |
794 | | // Transform 3: Rounds 37-40 |
795 | 0 | TMP = vld1q_u32(&K[36]); |
796 | 0 | TMP0A = vaddq_u32(MSG1A, TMP); |
797 | 0 | TMP0B = vaddq_u32(MSG1B, TMP); |
798 | 0 | TMP2A = STATE0A; |
799 | 0 | TMP2B = STATE0B; |
800 | 0 | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
801 | 0 | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
802 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
803 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
804 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
805 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
806 | 0 | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
807 | 0 | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
808 | | |
809 | | // Transform 3: Rounds 41-44 |
810 | 0 | TMP = vld1q_u32(&K[40]); |
811 | 0 | TMP0A = vaddq_u32(MSG2A, TMP); |
812 | 0 | TMP0B = vaddq_u32(MSG2B, TMP); |
813 | 0 | TMP2A = STATE0A; |
814 | 0 | TMP2B = STATE0B; |
815 | 0 | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
816 | 0 | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
817 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
818 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
819 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
820 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
821 | 0 | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
822 | 0 | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
823 | | |
824 | | // Transform 3: Rounds 45-48 |
825 | 0 | TMP = vld1q_u32(&K[44]); |
826 | 0 | TMP0A = vaddq_u32(MSG3A, TMP); |
827 | 0 | TMP0B = vaddq_u32(MSG3B, TMP); |
828 | 0 | TMP2A = STATE0A; |
829 | 0 | TMP2B = STATE0B; |
830 | 0 | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
831 | 0 | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
832 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
833 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
834 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
835 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
836 | 0 | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
837 | 0 | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
838 | | |
839 | | // Transform 3: Rounds 49-52 |
840 | 0 | TMP = vld1q_u32(&K[48]); |
841 | 0 | TMP0A = vaddq_u32(MSG0A, TMP); |
842 | 0 | TMP0B = vaddq_u32(MSG0B, TMP); |
843 | 0 | TMP2A = STATE0A; |
844 | 0 | TMP2B = STATE0B; |
845 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
846 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
847 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
848 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
849 | | |
850 | | // Transform 3: Rounds 53-56 |
851 | 0 | TMP = vld1q_u32(&K[52]); |
852 | 0 | TMP0A = vaddq_u32(MSG1A, TMP); |
853 | 0 | TMP0B = vaddq_u32(MSG1B, TMP); |
854 | 0 | TMP2A = STATE0A; |
855 | 0 | TMP2B = STATE0B; |
856 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
857 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
858 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
859 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
860 | | |
861 | | // Transform 3: Rounds 57-60 |
862 | 0 | TMP = vld1q_u32(&K[56]); |
863 | 0 | TMP0A = vaddq_u32(MSG2A, TMP); |
864 | 0 | TMP0B = vaddq_u32(MSG2B, TMP); |
865 | 0 | TMP2A = STATE0A; |
866 | 0 | TMP2B = STATE0B; |
867 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
868 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
869 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
870 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
871 | | |
872 | | // Transform 3: Rounds 61-64 |
873 | 0 | TMP = vld1q_u32(&K[60]); |
874 | 0 | TMP0A = vaddq_u32(MSG3A, TMP); |
875 | 0 | TMP0B = vaddq_u32(MSG3B, TMP); |
876 | 0 | TMP2A = STATE0A; |
877 | 0 | TMP2B = STATE0B; |
878 | 0 | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
879 | 0 | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
880 | 0 | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
881 | 0 | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
882 | | |
883 | | // Transform 3: Update state |
884 | 0 | TMP = vld1q_u32(&INIT[0]); |
885 | 0 | STATE0A = vaddq_u32(STATE0A, TMP); |
886 | 0 | STATE0B = vaddq_u32(STATE0B, TMP); |
887 | 0 | TMP = vld1q_u32(&INIT[4]); |
888 | 0 | STATE1A = vaddq_u32(STATE1A, TMP); |
889 | 0 | STATE1B = vaddq_u32(STATE1B, TMP); |
890 | | |
891 | | // Store result |
892 | 0 | vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A))); |
893 | 0 | vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A))); |
894 | 0 | vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B))); |
895 | | vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B))); |
896 | 0 | } |
897 | | } |
898 | | |
899 | | #endif |