Bitcoin Core Fuzz Coverage Report for #26966

Coverage Report

Created: 2025-10-10 09:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/brunogarcia/projects/bitcoin-core-dev/src/crypto/sha256_arm_shani.cpp
Line
Count
Source
1
// Copyright (c) 2022 The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6
// Written and placed in public domain by Jeffrey Walton.
7
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8
// Barry O'Rourke for the mbedTLS project.
9
// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11
#ifdef ENABLE_ARM_SHANI
12
13
#include <array>
14
#include <cstdint>
15
#include <cstddef>
16
#include <arm_acle.h>
17
#include <arm_neon.h>
18
19
namespace {
/** SHA-256 round constants (FIPS 180-4, section 4.2.2): the first 32 bits of
 *  the fractional parts of the cube roots of the first 64 primes. Aligned so
 *  the table can be loaded four-at-a-time with vld1q_u32 in the round code. */
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K = {
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
} // namespace
40
41
namespace sha256_arm_shani {
/** Process `blocks` consecutive 64-byte chunks starting at `chunk`, folding
 *  each into the 8-word SHA-256 state `s` (stored as host-order uint32_t)
 *  using the ARMv8 SHA-2 crypto extensions. The state is loaded once, updated
 *  in registers across all blocks, and stored back once at the end. */
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    // Current hash state, split across two Q registers as the SHA
    // instructions expect (ABEF in STATE0, CDGH in STATE1).
    uint32x4_t STATE0 = vld1q_u32(&s[0]);
    uint32x4_t STATE1 = vld1q_u32(&s[4]);
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t WK, OLD0;

    while (blocks--) {
        // Keep a copy of the incoming state; it is added back after 64 rounds.
        const uint32x4_t SAVE0 = STATE0;
        const uint32x4_t SAVE1 = STATE1;

        // Load the 64-byte chunk and byte-swap each 32-bit word to big endian.
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
        chunk += 64;

        // Note: an earlier variant preloaded the first message+constant sum
        // ("WK = vaddq_u32(MSG0, vld1q_u32(&K[0]));") before the loop body;
        // folding it into the first quad-round was 1-3% faster and frees one
        // Q register.

        // Rounds 1-4
        WK = vaddq_u32(MSG0, vld1q_u32(&K[0]));
        OLD0 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 5-8
        WK = vaddq_u32(MSG1, vld1q_u32(&K[4]));
        OLD0 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 9-12
        WK = vaddq_u32(MSG2, vld1q_u32(&K[8]));
        OLD0 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 13-16
        WK = vaddq_u32(MSG3, vld1q_u32(&K[12]));
        OLD0 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 17-20
        WK = vaddq_u32(MSG0, vld1q_u32(&K[16]));
        OLD0 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 21-24
        WK = vaddq_u32(MSG1, vld1q_u32(&K[20]));
        OLD0 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 25-28
        WK = vaddq_u32(MSG2, vld1q_u32(&K[24]));
        OLD0 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 29-32
        WK = vaddq_u32(MSG3, vld1q_u32(&K[28]));
        OLD0 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 33-36
        WK = vaddq_u32(MSG0, vld1q_u32(&K[32]));
        OLD0 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 37-40
        WK = vaddq_u32(MSG1, vld1q_u32(&K[36]));
        OLD0 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 41-44
        WK = vaddq_u32(MSG2, vld1q_u32(&K[40]));
        OLD0 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 45-48
        WK = vaddq_u32(MSG3, vld1q_u32(&K[44]));
        OLD0 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // The message schedule is fully consumed after round 48; the last
        // four quad-rounds need no su0/su1 updates.

        // Rounds 49-52
        WK = vaddq_u32(MSG0, vld1q_u32(&K[48]));
        OLD0 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);

        // Rounds 53-56
        WK = vaddq_u32(MSG1, vld1q_u32(&K[52]));
        OLD0 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);

        // Rounds 57-60
        WK = vaddq_u32(MSG2, vld1q_u32(&K[56]));
        OLD0 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);

        // Rounds 61-64
        WK = vaddq_u32(MSG3, vld1q_u32(&K[60]));
        OLD0 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, WK);
        STATE1 = vsha256h2q_u32(STATE1, OLD0, WK);

        // Feed-forward: add the saved input state to the round output.
        STATE0 = vaddq_u32(STATE0, SAVE0);
        STATE1 = vaddq_u32(STATE1, SAVE1);
    }

    // Write the final state back out.
    vst1q_u32(&s[0], STATE0);
    vst1q_u32(&s[4], STATE1);
}
} // namespace sha256_arm_shani
199
200
namespace sha256d64_arm_shani {
201
void Transform_2way(unsigned char* output, const unsigned char* input)
202
0
{
203
    /* Initial state. */
204
0
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
205
0
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
206
0
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
207
0
    };
208
209
    /* Precomputed message schedule for the 2nd transform. */
210
0
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
211
0
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
212
0
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
213
0
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
214
0
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
215
0
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
216
0
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
217
0
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
218
0
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
219
0
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
220
0
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
221
0
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
222
0
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
223
0
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
224
0
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
225
0
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
226
0
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
227
0
    };
228
229
    /* A few precomputed message schedule values for the 3rd transform. */
230
0
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
231
0
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
232
0
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
233
0
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
234
0
    };
235
236
    /* Padding processed in the 3rd transform (byteswapped). */
237
0
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
238
239
0
    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
240
0
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
241
0
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
242
243
    // Transform 1: Load state
244
0
    STATE0A = vld1q_u32(&INIT[0]);
245
0
    STATE0B = STATE0A;
246
0
    STATE1A = vld1q_u32(&INIT[4]);
247
0
    STATE1B = STATE1A;
248
249
    // Transform 1: Load and convert input chunk to Big Endian
250
0
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
251
0
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
252
0
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
253
0
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
254
0
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
255
0
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
256
0
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
257
0
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
258
259
    // Transform 1: Rounds 1-4
260
0
    TMP = vld1q_u32(&K[0]);
261
0
    TMP0A = vaddq_u32(MSG0A, TMP);
262
0
    TMP0B = vaddq_u32(MSG0B, TMP);
263
0
    TMP2A = STATE0A;
264
0
    TMP2B = STATE0B;
265
0
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
266
0
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
267
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
268
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
269
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
270
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
271
0
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
272
0
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
273
274
    // Transform 1: Rounds 5-8
275
0
    TMP = vld1q_u32(&K[4]);
276
0
    TMP0A = vaddq_u32(MSG1A, TMP);
277
0
    TMP0B = vaddq_u32(MSG1B, TMP);
278
0
    TMP2A = STATE0A;
279
0
    TMP2B = STATE0B;
280
0
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
281
0
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
282
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
283
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
284
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
285
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
286
0
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
287
0
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
288
289
    // Transform 1: Rounds 9-12
290
0
    TMP = vld1q_u32(&K[8]);
291
0
    TMP0A = vaddq_u32(MSG2A, TMP);
292
0
    TMP0B = vaddq_u32(MSG2B, TMP);
293
0
    TMP2A = STATE0A;
294
0
    TMP2B = STATE0B;
295
0
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
296
0
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
297
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
298
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
299
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
300
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
301
0
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
302
0
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
303
304
    // Transform 1: Rounds 13-16
305
0
    TMP = vld1q_u32(&K[12]);
306
0
    TMP0A = vaddq_u32(MSG3A, TMP);
307
0
    TMP0B = vaddq_u32(MSG3B, TMP);
308
0
    TMP2A = STATE0A;
309
0
    TMP2B = STATE0B;
310
0
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
311
0
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
312
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
313
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
314
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
315
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
316
0
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
317
0
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
318
319
    // Transform 1: Rounds 17-20
320
0
    TMP = vld1q_u32(&K[16]);
321
0
    TMP0A = vaddq_u32(MSG0A, TMP);
322
0
    TMP0B = vaddq_u32(MSG0B, TMP);
323
0
    TMP2A = STATE0A;
324
0
    TMP2B = STATE0B;
325
0
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
326
0
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
327
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
328
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
329
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
330
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
331
0
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
332
0
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
333
334
    // Transform 1: Rounds 21-24
335
0
    TMP = vld1q_u32(&K[20]);
336
0
    TMP0A = vaddq_u32(MSG1A, TMP);
337
0
    TMP0B = vaddq_u32(MSG1B, TMP);
338
0
    TMP2A = STATE0A;
339
0
    TMP2B = STATE0B;
340
0
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
341
0
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
342
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
343
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
344
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
345
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
346
0
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
347
0
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
348
349
    // Transform 1: Rounds 25-28
350
0
    TMP = vld1q_u32(&K[24]);
351
0
    TMP0A = vaddq_u32(MSG2A, TMP);
352
0
    TMP0B = vaddq_u32(MSG2B, TMP);
353
0
    TMP2A = STATE0A;
354
0
    TMP2B = STATE0B;
355
0
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
356
0
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
357
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
358
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
359
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
360
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
361
0
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
362
0
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
363
364
    // Transform 1: Rounds 29-32
365
0
    TMP = vld1q_u32(&K[28]);
366
0
    TMP0A = vaddq_u32(MSG3A, TMP);
367
0
    TMP0B = vaddq_u32(MSG3B, TMP);
368
0
    TMP2A = STATE0A;
369
0
    TMP2B = STATE0B;
370
0
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
371
0
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
372
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
373
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
374
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
375
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
376
0
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
377
0
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
378
379
    // Transform 1: Rounds 33-36
380
0
    TMP = vld1q_u32(&K[32]);
381
0
    TMP0A = vaddq_u32(MSG0A, TMP);
382
0
    TMP0B = vaddq_u32(MSG0B, TMP);
383
0
    TMP2A = STATE0A;
384
0
    TMP2B = STATE0B;
385
0
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
386
0
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
387
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
388
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
389
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
390
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
391
0
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
392
0
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
393
394
    // Transform 1: Rounds 37-40
395
0
    TMP = vld1q_u32(&K[36]);
396
0
    TMP0A = vaddq_u32(MSG1A, TMP);
397
0
    TMP0B = vaddq_u32(MSG1B, TMP);
398
0
    TMP2A = STATE0A;
399
0
    TMP2B = STATE0B;
400
0
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
401
0
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
402
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
403
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
404
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
405
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
406
0
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
407
0
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
408
409
    // Transform 1: Rounds 41-44
410
0
    TMP = vld1q_u32(&K[40]);
411
0
    TMP0A = vaddq_u32(MSG2A, TMP);
412
0
    TMP0B = vaddq_u32(MSG2B, TMP);
413
0
    TMP2A = STATE0A;
414
0
    TMP2B = STATE0B;
415
0
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
416
0
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
417
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
418
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
419
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
420
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
421
0
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
422
0
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
423
424
    // Transform 1: Rounds 45-48
425
0
    TMP = vld1q_u32(&K[44]);
426
0
    TMP0A = vaddq_u32(MSG3A, TMP);
427
0
    TMP0B = vaddq_u32(MSG3B, TMP);
428
0
    TMP2A = STATE0A;
429
0
    TMP2B = STATE0B;
430
0
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
431
0
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
432
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
433
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
434
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
435
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
436
0
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
437
0
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
438
439
    // Transform 1: Rounds 49-52
440
0
    TMP = vld1q_u32(&K[48]);
441
0
    TMP0A = vaddq_u32(MSG0A, TMP);
442
0
    TMP0B = vaddq_u32(MSG0B, TMP);
443
0
    TMP2A = STATE0A;
444
0
    TMP2B = STATE0B;
445
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
446
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
447
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
448
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
449
450
    // Transform 1: Rounds 53-56
451
0
    TMP = vld1q_u32(&K[52]);
452
0
    TMP0A = vaddq_u32(MSG1A, TMP);
453
0
    TMP0B = vaddq_u32(MSG1B, TMP);
454
0
    TMP2A = STATE0A;
455
0
    TMP2B = STATE0B;
456
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
457
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
458
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
459
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
460
461
    // Transform 1: Rounds 57-60
462
0
    TMP = vld1q_u32(&K[56]);
463
0
    TMP0A = vaddq_u32(MSG2A, TMP);
464
0
    TMP0B = vaddq_u32(MSG2B, TMP);
465
0
    TMP2A = STATE0A;
466
0
    TMP2B = STATE0B;
467
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
468
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
469
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
470
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
471
472
    // Transform 1: Rounds 61-64
473
0
    TMP = vld1q_u32(&K[60]);
474
0
    TMP0A = vaddq_u32(MSG3A, TMP);
475
0
    TMP0B = vaddq_u32(MSG3B, TMP);
476
0
    TMP2A = STATE0A;
477
0
    TMP2B = STATE0B;
478
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
479
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
480
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
481
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
482
483
    // Transform 1: Update state
484
0
    TMP = vld1q_u32(&INIT[0]);
485
0
    STATE0A = vaddq_u32(STATE0A, TMP);
486
0
    STATE0B = vaddq_u32(STATE0B, TMP);
487
0
    TMP = vld1q_u32(&INIT[4]);
488
0
    STATE1A = vaddq_u32(STATE1A, TMP);
489
0
    STATE1B = vaddq_u32(STATE1B, TMP);
490
491
    // Transform 2: Save state
492
0
    ABEF_SAVEA = STATE0A;
493
0
    ABEF_SAVEB = STATE0B;
494
0
    CDGH_SAVEA = STATE1A;
495
0
    CDGH_SAVEB = STATE1B;
496
497
    // Transform 2: Rounds 1-4
498
0
    TMP = vld1q_u32(&MIDS[0]);
499
0
    TMP2A = STATE0A;
500
0
    TMP2B = STATE0B;
501
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
502
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
503
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
504
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
505
506
    // Transform 2: Rounds 5-8
507
0
    TMP = vld1q_u32(&MIDS[4]);
508
0
    TMP2A = STATE0A;
509
0
    TMP2B = STATE0B;
510
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
511
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
512
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
513
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
514
515
    // Transform 2: Rounds 9-12
516
0
    TMP = vld1q_u32(&MIDS[8]);
517
0
    TMP2A = STATE0A;
518
0
    TMP2B = STATE0B;
519
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
520
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
521
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
522
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
523
524
    // Transform 2: Rounds 13-16
525
0
    TMP = vld1q_u32(&MIDS[12]);
526
0
    TMP2A = STATE0A;
527
0
    TMP2B = STATE0B;
528
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
529
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
530
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
531
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
532
533
    // Transform 2: Rounds 17-20
534
0
    TMP = vld1q_u32(&MIDS[16]);
535
0
    TMP2A = STATE0A;
536
0
    TMP2B = STATE0B;
537
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
538
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
539
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
540
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
541
542
    // Transform 2: Rounds 21-24
543
0
    TMP = vld1q_u32(&MIDS[20]);
544
0
    TMP2A = STATE0A;
545
0
    TMP2B = STATE0B;
546
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
547
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
548
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
549
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
550
551
    // Transform 2: Rounds 25-28
552
0
    TMP = vld1q_u32(&MIDS[24]);
553
0
    TMP2A = STATE0A;
554
0
    TMP2B = STATE0B;
555
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
556
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
557
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
558
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
559
560
    // Transform 2: Rounds 29-32
561
0
    TMP = vld1q_u32(&MIDS[28]);
562
0
    TMP2A = STATE0A;
563
0
    TMP2B = STATE0B;
564
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
565
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
566
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
567
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
568
569
    // Transform 2: Rounds 33-36
570
0
    TMP = vld1q_u32(&MIDS[32]);
571
0
    TMP2A = STATE0A;
572
0
    TMP2B = STATE0B;
573
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
574
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
575
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
576
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
577
578
    // Transform 2: Rounds 37-40
579
0
    TMP = vld1q_u32(&MIDS[36]);
580
0
    TMP2A = STATE0A;
581
0
    TMP2B = STATE0B;
582
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
583
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
584
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
585
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
586
587
    // Transform 2: Rounds 41-44
588
0
    TMP = vld1q_u32(&MIDS[40]);
589
0
    TMP2A = STATE0A;
590
0
    TMP2B = STATE0B;
591
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
592
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
593
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
594
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
595
596
    // Transform 2: Rounds 45-48
597
0
    TMP = vld1q_u32(&MIDS[44]);
598
0
    TMP2A = STATE0A;
599
0
    TMP2B = STATE0B;
600
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
601
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
602
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
603
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
604
605
    // Transform 2: Rounds 49-52
606
0
    TMP = vld1q_u32(&MIDS[48]);
607
0
    TMP2A = STATE0A;
608
0
    TMP2B = STATE0B;
609
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
610
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
611
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
612
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
613
614
    // Transform 2: Rounds 53-56
615
0
    TMP = vld1q_u32(&MIDS[52]);
616
0
    TMP2A = STATE0A;
617
0
    TMP2B = STATE0B;
618
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
619
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
620
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
621
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
622
623
    // Transform 2: Rounds 57-60
624
0
    TMP = vld1q_u32(&MIDS[56]);
625
0
    TMP2A = STATE0A;
626
0
    TMP2B = STATE0B;
627
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
628
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
629
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
630
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
631
632
    // Transform 2: Rounds 61-64
633
0
    TMP = vld1q_u32(&MIDS[60]);
634
0
    TMP2A = STATE0A;
635
0
    TMP2B = STATE0B;
636
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
637
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
638
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
639
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
640
641
    // Transform 2: Update state
642
0
    STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
643
0
    STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
644
0
    STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
645
0
    STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);
646
647
    // Transform 3: Pad previous output
648
0
    MSG0A = STATE0A;
649
0
    MSG0B = STATE0B;
650
0
    MSG1A = STATE1A;
651
0
    MSG1B = STATE1B;
652
0
    MSG2A = vld1q_u32(&FINAL[0]);
653
0
    MSG2B = MSG2A;
654
0
    MSG3A = vld1q_u32(&FINAL[4]);
655
0
    MSG3B = MSG3A;
656
657
    // Transform 3: Load state
658
0
    STATE0A = vld1q_u32(&INIT[0]);
659
0
    STATE0B = STATE0A;
660
0
    STATE1A = vld1q_u32(&INIT[4]);
661
0
    STATE1B = STATE1A;
662
663
    // Transform 3: Rounds 1-4
664
0
    TMP = vld1q_u32(&K[0]);
665
0
    TMP0A = vaddq_u32(MSG0A, TMP);
666
0
    TMP0B = vaddq_u32(MSG0B, TMP);
667
0
    TMP2A = STATE0A;
668
0
    TMP2B = STATE0B;
669
0
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
670
0
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
671
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
672
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
673
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
674
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
675
0
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
676
0
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
677
678
    // Transform 3: Rounds 5-8
679
0
    TMP = vld1q_u32(&K[4]);
680
0
    TMP0A = vaddq_u32(MSG1A, TMP);
681
0
    TMP0B = vaddq_u32(MSG1B, TMP);
682
0
    TMP2A = STATE0A;
683
0
    TMP2B = STATE0B;
684
0
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
685
0
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
686
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
687
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
688
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
689
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
690
0
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
691
0
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
692
693
    // Transform 3: Rounds 9-12
694
0
    TMP = vld1q_u32(&FINS[0]);
695
0
    TMP2A = STATE0A;
696
0
    TMP2B = STATE0B;
697
0
    MSG2A = vld1q_u32(&FINS[4]);
698
0
    MSG2B = MSG2A;
699
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
700
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
701
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
702
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
703
0
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
704
0
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
705
706
    // Transform 3: Rounds 13-16
707
0
    TMP = vld1q_u32(&FINS[8]);
708
0
    TMP2A = STATE0A;
709
0
    TMP2B = STATE0B;
710
0
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
711
0
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
712
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
713
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
714
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
715
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
716
0
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
717
0
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
718
719
    // Transform 3: Rounds 17-20
720
0
    TMP = vld1q_u32(&K[16]);
721
0
    TMP0A = vaddq_u32(MSG0A, TMP);
722
0
    TMP0B = vaddq_u32(MSG0B, TMP);
723
0
    TMP2A = STATE0A;
724
0
    TMP2B = STATE0B;
725
0
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
726
0
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
727
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
728
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
729
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
730
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
731
0
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
732
0
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
733
734
    // Transform 3: Rounds 21-24
735
0
    TMP = vld1q_u32(&K[20]);
736
0
    TMP0A = vaddq_u32(MSG1A, TMP);
737
0
    TMP0B = vaddq_u32(MSG1B, TMP);
738
0
    TMP2A = STATE0A;
739
0
    TMP2B = STATE0B;
740
0
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
741
0
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
742
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
743
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
744
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
745
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
746
0
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
747
0
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
748
749
    // Transform 3: Rounds 25-28
750
0
    TMP = vld1q_u32(&K[24]);
751
0
    TMP0A = vaddq_u32(MSG2A, TMP);
752
0
    TMP0B = vaddq_u32(MSG2B, TMP);
753
0
    TMP2A = STATE0A;
754
0
    TMP2B = STATE0B;
755
0
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
756
0
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
757
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
758
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
759
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
760
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
761
0
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
762
0
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
763
764
    // Transform 3: Rounds 29-32
765
0
    TMP = vld1q_u32(&K[28]);
766
0
    TMP0A = vaddq_u32(MSG3A, TMP);
767
0
    TMP0B = vaddq_u32(MSG3B, TMP);
768
0
    TMP2A = STATE0A;
769
0
    TMP2B = STATE0B;
770
0
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
771
0
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
772
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
773
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
774
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
775
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
776
0
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
777
0
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
778
779
    // Transform 3: Rounds 33-36
780
0
    TMP = vld1q_u32(&K[32]);
781
0
    TMP0A = vaddq_u32(MSG0A, TMP);
782
0
    TMP0B = vaddq_u32(MSG0B, TMP);
783
0
    TMP2A = STATE0A;
784
0
    TMP2B = STATE0B;
785
0
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
786
0
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
787
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
788
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
789
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
790
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
791
0
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
792
0
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
793
794
    // Transform 3: Rounds 37-40
795
0
    TMP = vld1q_u32(&K[36]);
796
0
    TMP0A = vaddq_u32(MSG1A, TMP);
797
0
    TMP0B = vaddq_u32(MSG1B, TMP);
798
0
    TMP2A = STATE0A;
799
0
    TMP2B = STATE0B;
800
0
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
801
0
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
802
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
803
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
804
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
805
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
806
0
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
807
0
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
808
809
    // Transform 3: Rounds 41-44
810
0
    TMP = vld1q_u32(&K[40]);
811
0
    TMP0A = vaddq_u32(MSG2A, TMP);
812
0
    TMP0B = vaddq_u32(MSG2B, TMP);
813
0
    TMP2A = STATE0A;
814
0
    TMP2B = STATE0B;
815
0
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
816
0
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
817
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
818
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
819
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
820
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
821
0
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
822
0
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
823
824
    // Transform 3: Rounds 45-48
825
0
    TMP = vld1q_u32(&K[44]);
826
0
    TMP0A = vaddq_u32(MSG3A, TMP);
827
0
    TMP0B = vaddq_u32(MSG3B, TMP);
828
0
    TMP2A = STATE0A;
829
0
    TMP2B = STATE0B;
830
0
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
831
0
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
832
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
833
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
834
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
835
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
836
0
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
837
0
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
838
839
    // Transform 3: Rounds 49-52
840
0
    TMP = vld1q_u32(&K[48]);
841
0
    TMP0A = vaddq_u32(MSG0A, TMP);
842
0
    TMP0B = vaddq_u32(MSG0B, TMP);
843
0
    TMP2A = STATE0A;
844
0
    TMP2B = STATE0B;
845
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
846
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
847
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
848
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
849
850
    // Transform 3: Rounds 53-56
851
0
    TMP = vld1q_u32(&K[52]);
852
0
    TMP0A = vaddq_u32(MSG1A, TMP);
853
0
    TMP0B = vaddq_u32(MSG1B, TMP);
854
0
    TMP2A = STATE0A;
855
0
    TMP2B = STATE0B;
856
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
857
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
858
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
859
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
860
861
    // Transform 3: Rounds 57-60
862
0
    TMP = vld1q_u32(&K[56]);
863
0
    TMP0A = vaddq_u32(MSG2A, TMP);
864
0
    TMP0B = vaddq_u32(MSG2B, TMP);
865
0
    TMP2A = STATE0A;
866
0
    TMP2B = STATE0B;
867
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
868
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
869
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
870
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
871
872
    // Transform 3: Rounds 61-64
873
0
    TMP = vld1q_u32(&K[60]);
874
0
    TMP0A = vaddq_u32(MSG3A, TMP);
875
0
    TMP0B = vaddq_u32(MSG3B, TMP);
876
0
    TMP2A = STATE0A;
877
0
    TMP2B = STATE0B;
878
0
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
879
0
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
880
0
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
881
0
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
882
883
    // Transform 3: Update state
884
0
    TMP = vld1q_u32(&INIT[0]);
885
0
    STATE0A = vaddq_u32(STATE0A, TMP);
886
0
    STATE0B = vaddq_u32(STATE0B, TMP);
887
0
    TMP = vld1q_u32(&INIT[4]);
888
0
    STATE1A = vaddq_u32(STATE1A, TMP);
889
0
    STATE1B = vaddq_u32(STATE1B, TMP);
890
891
    // Store result
892
0
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
893
0
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
894
0
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
895
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
896
0
}
897
}
898
899
#endif