Crypto++ 8.0
Free C++ class library of cryptographic schemes
sha_simd.cpp
1 // sha_simd.cpp - written and placed in the public domain by
2 // Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3 //
4 // This source file uses intrinsics to gain access to SHA-NI and
5 // ARMv8a SHA instructions. A separate source file is needed
6 // because additional CXXFLAGS are required to enable the
7 // appropriate instruction sets in some build configurations.
8 
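For context, the "additional CXXFLAGS" mentioned above are per-file ISA options passed when compiling this translation unit. The lines below are an illustrative sketch only; the exact flags come from the library's own makefiles and depend on the compiler and target:

// Illustrative per-file options for GNU-style toolchains (assumption, not the
// library's authoritative flag list):
//   x86/x64 SHA-NI : CXXFLAGS += -msse4.2 -msha
//   ARMv8 Crypto   : CXXFLAGS += -march=armv8-a+crypto
//   POWER8         : CXXFLAGS += -mcpu=power8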
9 #include "pch.h"
10 #include "config.h"
11 #include "sha.h"
12 #include "misc.h"
13 
14 #if defined(CRYPTOPP_DISABLE_SHA_ASM)
15 # undef CRYPTOPP_X86_ASM_AVAILABLE
16 # undef CRYPTOPP_X32_ASM_AVAILABLE
17 # undef CRYPTOPP_X64_ASM_AVAILABLE
18 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
19 #endif
20 
21 #if (CRYPTOPP_SHANI_AVAILABLE)
22 # include <nmmintrin.h>
23 # include <immintrin.h>
24 #endif
25 
26 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
27 # include <arm_neon.h>
28 #endif
29 
30 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
31 # include <stdint.h>
32 # include <arm_acle.h>
33 #endif
34 
35 #if CRYPTOPP_POWER8_SHA_AVAILABLE
36 # include "ppc_simd.h"
37 #endif
38 
39 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
40 # include <signal.h>
41 # include <setjmp.h>
42 #endif
43 
44 #ifndef EXCEPTION_EXECUTE_HANDLER
45 # define EXCEPTION_EXECUTE_HANDLER 1
46 #endif
47 
48 // Clang __m128i casts
49 #define M128_CAST(x) ((__m128i *)(void *)(x))
50 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
51 
52 // Squash MS LNK4221 and libtool warnings
53 extern const char SHA_SIMD_FNAME[] = __FILE__;
54 
55 NAMESPACE_BEGIN(CryptoPP)
56 
57 // ***************** SHA key tables ********************
58 
59 extern const word32 SHA256_K[64];
60 extern const word64 SHA512_K[80];
61 
62 // ***************** SIGILL probes ********************
63 
64 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
65 extern "C" {
66  typedef void (*SigHandler)(int);
67 
68  static jmp_buf s_jmpSIGILL;
69  static void SigIllHandler(int)
70  {
71  longjmp(s_jmpSIGILL, 1);
72  }
73 }
74 #endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
75 
76 #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
77 bool CPU_ProbeSHA1()
78 {
79 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
80  return false;
81 #elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
82 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
83  volatile bool result = true;
84  __try
85  {
86  uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
87 
88  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
89  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
90  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
91  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
92  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
93 
94  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
95  }
96  __except (EXCEPTION_EXECUTE_HANDLER)
97  {
98  return false;
99  }
100  return result;
101 # else
102 
103  // longjmp and clobber warnings. Volatile is required.
104  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
105  volatile bool result = true;
106 
107  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
108  if (oldHandler == SIG_ERR)
109  return false;
110 
111  volatile sigset_t oldMask;
112  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
113  return false;
114 
115  if (setjmp(s_jmpSIGILL))
116  result = false;
117  else
118  {
119  uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
120 
121  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
122  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
123  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
124  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
125  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
126 
127  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
128  }
129 
130  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
131  signal(SIGILL, oldHandler);
132  return result;
133 # endif
134 #else
135  return false;
136 #endif // CRYPTOPP_ARM_SHA1_AVAILABLE
137 }
138 
139 bool CPU_ProbeSHA2()
140 {
141 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
142  return false;
143 #elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
144 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
145  volatile bool result = true;
146  __try
147  {
148  uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
149 
150  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
151  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
152  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
153  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
154 
155  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
156  }
157  __except (EXCEPTION_EXECUTE_HANDLER)
158  {
159  return false;
160  }
161  return result;
162 # else
163 
164  // longjmp and clobber warnings. Volatile is required.
165  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
166  volatile bool result = true;
167 
168  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
169  if (oldHandler == SIG_ERR)
170  return false;
171 
172  volatile sigset_t oldMask;
173  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
174  return false;
175 
176  if (setjmp(s_jmpSIGILL))
177  result = false;
178  else
179  {
180  uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
181 
182  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
183  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
184  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
185  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
186 
187  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
188  }
189 
190  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
191  signal(SIGILL, oldHandler);
192  return result;
193 # endif
194 #else
195  return false;
196 #endif // CRYPTOPP_ARM_SHA2_AVAILABLE
197 }
198 #endif // ARM32 or ARM64
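These probes are relatively expensive (they may take a SIGILL), so they are not meant to be called per hash. Feature-detection code typically runs each probe once and caches the result behind an accessor. A minimal caching sketch for an ARM build, using hypothetical names g_sha1Probed/g_hasSHA1 (the library's real detection lives in cpu.cpp):

// Minimal caching sketch; the names below are hypothetical, not the library's internals.
static bool g_sha1Probed = false, g_hasSHA1 = false;

inline bool HasSHA1_Example()
{
    if (!g_sha1Probed)
    {
        g_hasSHA1 = CPU_ProbeSHA1();  // falls back to the SIGILL probe above
        g_sha1Probed = true;
    }
    return g_hasSHA1;
}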
199 
200 // ***************** Intel x86 SHA ********************
201 
202 /////////////////////////////////////
203 // start of Walton and Gulley code //
204 /////////////////////////////////////
205 
206 #if CRYPTOPP_SHANI_AVAILABLE
207 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
208 void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
209 {
210  CRYPTOPP_ASSERT(state);
211  CRYPTOPP_ASSERT(data);
212  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
213 
214  __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
215  __m128i MASK, MSG0, MSG1, MSG2, MSG3;
216 
217  // Load initial values
218  ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
219  E0 = _mm_set_epi32(state[4], 0, 0, 0);
220  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
221 
222  // IA-32 SHA is little endian, SHA::Transform is big endian,
223  // and SHA::HashMultipleBlocks can be either. ByteOrder
224  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
225  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
226  _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
227  _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
228 
229  while (length >= SHA1::BLOCKSIZE)
230  {
231  // Save current hash
232  ABCD_SAVE = ABCD;
233  E0_SAVE = E0;
234 
235  // Rounds 0-3
236  MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
237  MSG0 = _mm_shuffle_epi8(MSG0, MASK);
238  E0 = _mm_add_epi32(E0, MSG0);
239  E1 = ABCD;
240  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
241 
242  // Rounds 4-7
243  MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
244  MSG1 = _mm_shuffle_epi8(MSG1, MASK);
245  E1 = _mm_sha1nexte_epu32(E1, MSG1);
246  E0 = ABCD;
247  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
248  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
249 
250  // Rounds 8-11
251  MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
252  MSG2 = _mm_shuffle_epi8(MSG2, MASK);
253  E0 = _mm_sha1nexte_epu32(E0, MSG2);
254  E1 = ABCD;
255  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
256  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
257  MSG0 = _mm_xor_si128(MSG0, MSG2);
258 
259  // Rounds 12-15
260  MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
261  MSG3 = _mm_shuffle_epi8(MSG3, MASK);
262  E1 = _mm_sha1nexte_epu32(E1, MSG3);
263  E0 = ABCD;
264  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
265  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
266  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
267  MSG1 = _mm_xor_si128(MSG1, MSG3);
268 
269  // Rounds 16-19
270  E0 = _mm_sha1nexte_epu32(E0, MSG0);
271  E1 = ABCD;
272  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
273  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
274  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
275  MSG2 = _mm_xor_si128(MSG2, MSG0);
276 
277  // Rounds 20-23
278  E1 = _mm_sha1nexte_epu32(E1, MSG1);
279  E0 = ABCD;
280  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
281  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
282  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
283  MSG3 = _mm_xor_si128(MSG3, MSG1);
284 
285  // Rounds 24-27
286  E0 = _mm_sha1nexte_epu32(E0, MSG2);
287  E1 = ABCD;
288  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
289  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
290  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
291  MSG0 = _mm_xor_si128(MSG0, MSG2);
292 
293  // Rounds 28-31
294  E1 = _mm_sha1nexte_epu32(E1, MSG3);
295  E0 = ABCD;
296  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
297  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
298  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
299  MSG1 = _mm_xor_si128(MSG1, MSG3);
300 
301  // Rounds 32-35
302  E0 = _mm_sha1nexte_epu32(E0, MSG0);
303  E1 = ABCD;
304  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
305  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
306  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
307  MSG2 = _mm_xor_si128(MSG2, MSG0);
308 
309  // Rounds 36-39
310  E1 = _mm_sha1nexte_epu32(E1, MSG1);
311  E0 = ABCD;
312  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
313  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
314  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
315  MSG3 = _mm_xor_si128(MSG3, MSG1);
316 
317  // Rounds 40-43
318  E0 = _mm_sha1nexte_epu32(E0, MSG2);
319  E1 = ABCD;
320  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
321  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
322  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
323  MSG0 = _mm_xor_si128(MSG0, MSG2);
324 
325  // Rounds 44-47
326  E1 = _mm_sha1nexte_epu32(E1, MSG3);
327  E0 = ABCD;
328  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
329  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
330  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
331  MSG1 = _mm_xor_si128(MSG1, MSG3);
332 
333  // Rounds 48-51
334  E0 = _mm_sha1nexte_epu32(E0, MSG0);
335  E1 = ABCD;
336  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
337  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
338  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
339  MSG2 = _mm_xor_si128(MSG2, MSG0);
340 
341  // Rounds 52-55
342  E1 = _mm_sha1nexte_epu32(E1, MSG1);
343  E0 = ABCD;
344  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
345  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
346  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
347  MSG3 = _mm_xor_si128(MSG3, MSG1);
348 
349  // Rounds 56-59
350  E0 = _mm_sha1nexte_epu32(E0, MSG2);
351  E1 = ABCD;
352  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
353  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
354  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
355  MSG0 = _mm_xor_si128(MSG0, MSG2);
356 
357  // Rounds 60-63
358  E1 = _mm_sha1nexte_epu32(E1, MSG3);
359  E0 = ABCD;
360  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
361  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
362  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
363  MSG1 = _mm_xor_si128(MSG1, MSG3);
364 
365  // Rounds 64-67
366  E0 = _mm_sha1nexte_epu32(E0, MSG0);
367  E1 = ABCD;
368  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
369  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
370  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
371  MSG2 = _mm_xor_si128(MSG2, MSG0);
372 
373  // Rounds 68-71
374  E1 = _mm_sha1nexte_epu32(E1, MSG1);
375  E0 = ABCD;
376  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
377  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
378  MSG3 = _mm_xor_si128(MSG3, MSG1);
379 
380  // Rounds 72-75
381  E0 = _mm_sha1nexte_epu32(E0, MSG2);
382  E1 = ABCD;
383  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
384  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
385 
386  // Rounds 76-79
387  E1 = _mm_sha1nexte_epu32(E1, MSG3);
388  E0 = ABCD;
389  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
390 
391  // Add values back to state
392  E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
393  ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
394 
395  data += SHA1::BLOCKSIZE/sizeof(word32);
396  length -= SHA1::BLOCKSIZE;
397  }
398 
399  // Save state
400  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
401  _mm_storeu_si128(M128_CAST(state), ABCD);
402  state[4] = _mm_extract_epi32(E0, 3);
403 }
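A minimal usage sketch of the routine above. In the library this function is normally reached through SHA1::Update() after runtime dispatch, so the direct call below is purely illustrative:

// Illustrative only: compress one pre-padded 64-byte block with SHA-NI,
// assuming the feature was already detected at runtime (e.g. via HasSHA()).
inline void SHA1_OneBlockExample(word32 state[5], const word32 block[16])
{
    // SHA-1 initial chaining value (FIPS 180-4)
    state[0] = 0x67452301; state[1] = 0xEFCDAB89; state[2] = 0x98BADCFE;
    state[3] = 0x10325476; state[4] = 0xC3D2E1F0;

    // block[] holds the message bytes in their original (big-endian) order
    SHA1_HashMultipleBlocks_SHANI(state, block, SHA1::BLOCKSIZE, BIG_ENDIAN_ORDER);
}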
404 
405 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
406 void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
407 {
408  CRYPTOPP_ASSERT(state);
409  CRYPTOPP_ASSERT(data);
410  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
411 
412  __m128i STATE0, STATE1;
413  __m128i MSG, TMP, MASK;
414  __m128i TMSG0, TMSG1, TMSG2, TMSG3;
415  __m128i ABEF_SAVE, CDGH_SAVE;
416 
417  // Load initial values
418  TMP = _mm_loadu_si128(M128_CAST(&state[0]));
419  STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
420 
421  // IA-32 SHA is little endian, SHA::Transform is big endian,
422  // and SHA::HashMultipleBlocks can be either. ByteOrder
423  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
424  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
425  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
426  _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
427 
428  TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
429  STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
430  STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
431  STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
432 
433  while (length >= SHA256::BLOCKSIZE)
434  {
435  // Save current hash
436  ABEF_SAVE = STATE0;
437  CDGH_SAVE = STATE1;
438 
439  // Rounds 0-3
440  MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
441  TMSG0 = _mm_shuffle_epi8(MSG, MASK);
442  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
443  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
444  MSG = _mm_shuffle_epi32(MSG, 0x0E);
445  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
446 
447  // Rounds 4-7
448  TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
449  TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
450  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
451  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
452  MSG = _mm_shuffle_epi32(MSG, 0x0E);
453  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
454  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
455 
456  // Rounds 8-11
457  TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
458  TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
459  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
460  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
461  MSG = _mm_shuffle_epi32(MSG, 0x0E);
462  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
463  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
464 
465  // Rounds 12-15
466  TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
467  TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
468  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
469  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
470  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
471  TMSG0 = _mm_add_epi32(TMSG0, TMP);
472  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
473  MSG = _mm_shuffle_epi32(MSG, 0x0E);
474  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
475  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
476 
477  // Rounds 16-19
478  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
479  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
480  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
481  TMSG1 = _mm_add_epi32(TMSG1, TMP);
482  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
483  MSG = _mm_shuffle_epi32(MSG, 0x0E);
484  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
485  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
486 
487  // Rounds 20-23
488  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
489  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
490  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
491  TMSG2 = _mm_add_epi32(TMSG2, TMP);
492  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
493  MSG = _mm_shuffle_epi32(MSG, 0x0E);
494  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
495  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
496 
497  // Rounds 24-27
498  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
499  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
500  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
501  TMSG3 = _mm_add_epi32(TMSG3, TMP);
502  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
503  MSG = _mm_shuffle_epi32(MSG, 0x0E);
504  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
505  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
506 
507  // Rounds 28-31
508  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
509  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
510  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
511  TMSG0 = _mm_add_epi32(TMSG0, TMP);
512  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
513  MSG = _mm_shuffle_epi32(MSG, 0x0E);
514  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
515  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
516 
517  // Rounds 32-35
518  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
519  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
520  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
521  TMSG1 = _mm_add_epi32(TMSG1, TMP);
522  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
523  MSG = _mm_shuffle_epi32(MSG, 0x0E);
524  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
525  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
526 
527  // Rounds 36-39
528  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
529  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
530  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
531  TMSG2 = _mm_add_epi32(TMSG2, TMP);
532  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
533  MSG = _mm_shuffle_epi32(MSG, 0x0E);
534  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
535  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
536 
537  // Rounds 40-43
538  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
539  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
540  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
541  TMSG3 = _mm_add_epi32(TMSG3, TMP);
542  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
543  MSG = _mm_shuffle_epi32(MSG, 0x0E);
544  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
545  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
546 
547  // Rounds 44-47
548  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
549  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
550  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
551  TMSG0 = _mm_add_epi32(TMSG0, TMP);
552  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
553  MSG = _mm_shuffle_epi32(MSG, 0x0E);
554  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
555  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
556 
557  // Rounds 48-51
558  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
559  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
560  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
561  TMSG1 = _mm_add_epi32(TMSG1, TMP);
562  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
563  MSG = _mm_shuffle_epi32(MSG, 0x0E);
564  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
565  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
566 
567  // Rounds 52-55
568  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
569  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
570  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
571  TMSG2 = _mm_add_epi32(TMSG2, TMP);
572  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
573  MSG = _mm_shuffle_epi32(MSG, 0x0E);
574  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
575 
576  // Rounds 56-59
577  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
578  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
579  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
580  TMSG3 = _mm_add_epi32(TMSG3, TMP);
581  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
582  MSG = _mm_shuffle_epi32(MSG, 0x0E);
583  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
584 
585  // Rounds 60-63
586  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
587  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
588  MSG = _mm_shuffle_epi32(MSG, 0x0E);
589  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
590 
591  // Add values back to state
592  STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
593  STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
594 
595  data += SHA256::BLOCKSIZE/sizeof(word32);
596  length -= SHA256::BLOCKSIZE;
597  }
598 
599  TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
600  STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
601  STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
602  STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
603 
604  // Save state
605  _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
606  _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
607 }
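A note on the hard-coded 64-bit immediates above: each _mm_set_epi64x() call simply packs four consecutive SHA256_K round constants into one vector, low words first. For rounds 0-3, K[0..3] = 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, so the packed pair is 0x71374491428A2F98 (low) and 0xE9B5DBA5B5C0FBCF (high). A sketch of the equivalence (illustrative; the function above hard-codes the values rather than loading them):

// These two vectors hold the same four round constants in the same lanes:
__m128i k_loaded = _mm_loadu_si128(CONST_M128_CAST(&SHA256_K[0]));
__m128i k_packed = _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98));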
608 #endif // CRYPTOPP_SHANI_AVAILABLE
609 
610 ///////////////////////////////////
611 // end of Walton and Gulley code //
612 ///////////////////////////////////
613 
614 // ***************** ARMV8 SHA ********************
615 
616 /////////////////////////////////////////////////////////////
617 // start of Walton, Schneiders, O'Rourke and Hovsmith code //
618 /////////////////////////////////////////////////////////////
619 
620 #if CRYPTOPP_ARM_SHA1_AVAILABLE
621 void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
622 {
623  CRYPTOPP_ASSERT(state);
624  CRYPTOPP_ASSERT(data);
625  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
626 
627  uint32x4_t C0, C1, C2, C3;
628  uint32x4_t ABCD, ABCD_SAVED;
629  uint32x4_t MSG0, MSG1, MSG2, MSG3;
630  uint32x4_t TMP0, TMP1;
631  uint32_t E0, E0_SAVED, E1;
632 
633  // Load initial values
634  C0 = vdupq_n_u32(0x5A827999);
635  C1 = vdupq_n_u32(0x6ED9EBA1);
636  C2 = vdupq_n_u32(0x8F1BBCDC);
637  C3 = vdupq_n_u32(0xCA62C1D6);
638 
639  ABCD = vld1q_u32(&state[0]);
640  E0 = state[4];
641 
642  while (length >= SHA1::BLOCKSIZE)
643  {
644  // Save current hash
645  ABCD_SAVED = ABCD;
646  E0_SAVED = E0;
647 
648  MSG0 = vld1q_u32(data + 0);
649  MSG1 = vld1q_u32(data + 4);
650  MSG2 = vld1q_u32(data + 8);
651  MSG3 = vld1q_u32(data + 12);
652 
653  if (order == BIG_ENDIAN_ORDER) // Data arrangement
654  {
655  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
656  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
657  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
658  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
659  }
660 
661  TMP0 = vaddq_u32(MSG0, C0);
662  TMP1 = vaddq_u32(MSG1, C0);
663 
664  // Rounds 0-3
665  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
666  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
667  TMP0 = vaddq_u32(MSG2, C0);
668  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
669 
670  // Rounds 4-7
671  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
672  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
673  TMP1 = vaddq_u32(MSG3, C0);
674  MSG0 = vsha1su1q_u32(MSG0, MSG3);
675  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
676 
677  // Rounds 8-11
678  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
679  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
680  TMP0 = vaddq_u32(MSG0, C0);
681  MSG1 = vsha1su1q_u32(MSG1, MSG0);
682  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
683 
684  // Rounds 12-15
685  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
686  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
687  TMP1 = vaddq_u32(MSG1, C1);
688  MSG2 = vsha1su1q_u32(MSG2, MSG1);
689  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
690 
691  // Rounds 16-19
692  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
693  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
694  TMP0 = vaddq_u32(MSG2, C1);
695  MSG3 = vsha1su1q_u32(MSG3, MSG2);
696  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
697 
698  // Rounds 20-23
699  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
700  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
701  TMP1 = vaddq_u32(MSG3, C1);
702  MSG0 = vsha1su1q_u32(MSG0, MSG3);
703  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
704 
705  // Rounds 24-27
706  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
707  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
708  TMP0 = vaddq_u32(MSG0, C1);
709  MSG1 = vsha1su1q_u32(MSG1, MSG0);
710  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
711 
712  // Rounds 28-31
713  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
714  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
715  TMP1 = vaddq_u32(MSG1, C1);
716  MSG2 = vsha1su1q_u32(MSG2, MSG1);
717  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
718 
719  // Rounds 32-35
720  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
721  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
722  TMP0 = vaddq_u32(MSG2, C2);
723  MSG3 = vsha1su1q_u32(MSG3, MSG2);
724  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
725 
726  // Rounds 36-39
727  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
728  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
729  TMP1 = vaddq_u32(MSG3, C2);
730  MSG0 = vsha1su1q_u32(MSG0, MSG3);
731  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
732 
733  // Rounds 40-43
734  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
735  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
736  TMP0 = vaddq_u32(MSG0, C2);
737  MSG1 = vsha1su1q_u32(MSG1, MSG0);
738  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
739 
740  // Rounds 44-47
741  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
742  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
743  TMP1 = vaddq_u32(MSG1, C2);
744  MSG2 = vsha1su1q_u32(MSG2, MSG1);
745  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
746 
747  // Rounds 48-51
748  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
749  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
750  TMP0 = vaddq_u32(MSG2, C2);
751  MSG3 = vsha1su1q_u32(MSG3, MSG2);
752  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
753 
754  // Rounds 52-55
755  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
756  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
757  TMP1 = vaddq_u32(MSG3, C3);
758  MSG0 = vsha1su1q_u32(MSG0, MSG3);
759  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
760 
761  // Rounds 56-59
762  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
763  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
764  TMP0 = vaddq_u32(MSG0, C3);
765  MSG1 = vsha1su1q_u32(MSG1, MSG0);
766  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
767 
768  // Rounds 60-63
769  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
770  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
771  TMP1 = vaddq_u32(MSG1, C3);
772  MSG2 = vsha1su1q_u32(MSG2, MSG1);
773  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
774 
775  // Rounds 64-67
776  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
777  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
778  TMP0 = vaddq_u32(MSG2, C3);
779  MSG3 = vsha1su1q_u32(MSG3, MSG2);
780  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
781 
782  // Rounds 68-71
783  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
784  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
785  TMP1 = vaddq_u32(MSG3, C3);
786  MSG0 = vsha1su1q_u32(MSG0, MSG3);
787 
788  // Rounds 72-75
789  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
790  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
791 
792  // Rounds 76-79
793  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
794  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
795 
796  E0 += E0_SAVED;
797  ABCD = vaddq_u32(ABCD_SAVED, ABCD);
798 
799  data += SHA1::BLOCKSIZE/sizeof(word32);
800  length -= SHA1::BLOCKSIZE;
801  }
802 
803  // Save state
804  vst1q_u32(&state[0], ABCD);
805  state[4] = E0;
806 }
807 #endif // CRYPTOPP_ARM_SHA1_AVAILABLE
808 
809 #if CRYPTOPP_ARM_SHA2_AVAILABLE
810 void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
811 {
812  CRYPTOPP_ASSERT(state);
813  CRYPTOPP_ASSERT(data);
814  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
815 
816  uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
817  uint32x4_t MSG0, MSG1, MSG2, MSG3;
818  uint32x4_t TMP0, TMP1, TMP2;
819 
820  // Load initial values
821  STATE0 = vld1q_u32(&state[0]);
822  STATE1 = vld1q_u32(&state[4]);
823 
824  while (length >= SHA256::BLOCKSIZE)
825  {
826  // Save current hash
827  ABEF_SAVE = STATE0;
828  CDGH_SAVE = STATE1;
829 
830  // Load message
831  MSG0 = vld1q_u32(data + 0);
832  MSG1 = vld1q_u32(data + 4);
833  MSG2 = vld1q_u32(data + 8);
834  MSG3 = vld1q_u32(data + 12);
835 
836  if (order == BIG_ENDIAN_ORDER) // Data arrangement
837  {
838  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
839  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
840  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
841  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
842  }
843 
844  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
845 
846  // Rounds 0-3
847  MSG0 = vsha256su0q_u32(MSG0, MSG1);
848  TMP2 = STATE0;
849  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
850  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
851  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
852  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
853 
854  // Rounds 4-7
855  MSG1 = vsha256su0q_u32(MSG1, MSG2);
856  TMP2 = STATE0;
857  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
858  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
859  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
860  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
861 
862  // Rounds 8-11
863  MSG2 = vsha256su0q_u32(MSG2, MSG3);
864  TMP2 = STATE0;
865  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
866  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
867  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
868  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
869 
870  // Rounds 12-15
871  MSG3 = vsha256su0q_u32(MSG3, MSG0);
872  TMP2 = STATE0;
873  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
874  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
875  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
876  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
877 
878  // Rounds 16-19
879  MSG0 = vsha256su0q_u32(MSG0, MSG1);
880  TMP2 = STATE0;
881  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
882  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
883  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
884  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
885 
886  // Rounds 20-23
887  MSG1 = vsha256su0q_u32(MSG1, MSG2);
888  TMP2 = STATE0;
889  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
890  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
891  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
892  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
893 
894  // Rounds 24-27
895  MSG2 = vsha256su0q_u32(MSG2, MSG3);
896  TMP2 = STATE0;
897  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
898  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
899  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
900  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
901 
902  // Rounds 28-31
903  MSG3 = vsha256su0q_u32(MSG3, MSG0);
904  TMP2 = STATE0;
905  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
906  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
907  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
908  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
909 
910  // Rounds 32-35
911  MSG0 = vsha256su0q_u32(MSG0, MSG1);
912  TMP2 = STATE0;
913  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
914  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
915  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
916  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
917 
918  // Rounds 36-39
919  MSG1 = vsha256su0q_u32(MSG1, MSG2);
920  TMP2 = STATE0;
921  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
922  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
923  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
924  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
925 
926  // Rounds 40-43
927  MSG2 = vsha256su0q_u32(MSG2, MSG3);
928  TMP2 = STATE0;
929  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
930  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
931  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
932  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
933 
934  // Rounds 44-47
935  MSG3 = vsha256su0q_u32(MSG3, MSG0);
936  TMP2 = STATE0;
937  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
938  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
939  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
940  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
941 
942  // Rounds 48-51
943  TMP2 = STATE0;
944  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
945  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
946  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
947 
948  // Rounds 52-55
949  TMP2 = STATE0;
950  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
951  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
952  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
953 
954  // Rounds 56-59
955  TMP2 = STATE0;
956  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
957  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
958  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
959 
960  // Rounds 60-63
961  TMP2 = STATE0;
962  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
963  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
964 
965  // Add back to state
966  STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
967  STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
968 
969  data += SHA256::BLOCKSIZE/sizeof(word32);
970  length -= SHA256::BLOCKSIZE;
971  }
972 
973  // Save state
974  vst1q_u32(&state[0], STATE0);
975  vst1q_u32(&state[4], STATE1);
976 }
977 #endif // CRYPTOPP_ARM_SHA2_AVAILABLE
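On ARM these block functions are selected at run time using the probes earlier in this file, surfaced through the library's HasSHA1()/HasSHA2() queries. A dispatch sketch, illustrative only; the actual selection logic lives in sha.cpp:

// Illustrative dispatch sketch (not the library's actual code path):
inline void SHA256_HashBlocks_Example(word32 *state, const word32 *data, size_t length)
{
#if CRYPTOPP_ARM_SHA2_AVAILABLE
    if (HasSHA2())  // cached result of the CPU feature probe
        return SHA256_HashMultipleBlocks_ARMV8(state, data, length, BIG_ENDIAN_ORDER);
#endif
    // otherwise fall back to the portable C++ transform in sha.cpp
    CRYPTOPP_UNUSED(state); CRYPTOPP_UNUSED(data); CRYPTOPP_UNUSED(length);
}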
978 
979 ///////////////////////////////////////////////////////////
980 // end of Walton, Schneiders, O'Rourke and Hovsmith code //
981 ///////////////////////////////////////////////////////////
982 
983 // ***************** Power8 SHA ********************
984 
985 //////////////////////////////////////////////////
986 // start Gustavo, Serra, Scalet and Walton code //
987 //////////////////////////////////////////////////
988 
989 #if CRYPTOPP_POWER8_SHA_AVAILABLE
990 
991 // Indexes into the S[] array
992 enum {A=0, B=1, C, D, E, F, G, H};
993 
994 inline
995 uint32x4_p VecLoad32(const word32* data, int offset)
996 {
997 #if (CRYPTOPP_LITTLE_ENDIAN)
998  const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
999  const uint32x4_p val = VecLoad(offset, data);
1000  return (uint32x4_p)VecPermute(val, val, mask);
1001 #else
1002  return VecLoad(offset, data);
1003 #endif
1004 }
1005 
1006 template<class T> inline
1007 void VecStore32(const T data, word32 dest[4])
1008 {
1009  VecStore(data, dest);
1010 }
1011 
1012 inline
1013 uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1014 {
1015  // The trick below is due to Andy Polyakov and Jack Lloyd
1016  return vec_sel(z,y,x);
1017 }
1018 
1019 inline
1020 uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1021 {
1022  // The trick below is due to Andy Polyakov and Jack Lloyd
1023  return vec_sel(y, z, VecXor(x, y));
1024 }
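The vec_sel() forms above compute the standard SHA-2 Ch and Maj functions with a single bit-select each. For reference only, the scalar identities they rely on:

// Ch(x,y,z)  = (x & y) ^ (~x & z)           -- per-bit "x ? y : z"               == vec_sel(z, y, x)
// Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z)  -- "z where x differs from y, else y" == vec_sel(y, z, x ^ y)
inline word32 ScalarCh(word32 x, word32 y, word32 z)  { return (x & y) ^ (~x & z); }
inline word32 ScalarMaj(word32 x, word32 y, word32 z) { return (x & y) ^ (x & z) ^ (y & z); }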
1025 
1026 inline
1027 uint32x4_p Vector_sigma0(const uint32x4_p val)
1028 {
1029  return VecSHA256<0,0>(val);
1030 }
1031 
1032 inline
1033 uint32x4_p Vector_sigma1(const uint32x4_p val)
1034 {
1035  return VecSHA256<0,0xf>(val);
1036 }
1037 
1038 inline
1039 uint32x4_p VectorSigma0(const uint32x4_p val)
1040 {
1041  return VecSHA256<1,0>(val);
1042 }
1043 
1044 inline
1045 uint32x4_p VectorSigma1(const uint32x4_p val)
1046 {
1047  return VecSHA256<1,0xf>(val);
1048 }
1049 
1050 inline
1051 uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
1052  const uint32x4_p c, const uint32x4_p d)
1053 {
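 // Packs one 32-bit word from each of a, b, c and d into a single vector;
 // used below to re-form the abcd/efgh state vectors after the rounds.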
1054  const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1055  const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1056  return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
1057 }
1058 
1059 template <unsigned int R> inline
1060 void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
1061 {
1062  uint32x4_p T1, T2;
1063 
1064  W[R] = M;
1065  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1066  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1067 
1068  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1069  S[E] = S[D] + T1;
1070  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1071  S[A] = T1 + T2;
1072 }
1073 
1074 template <unsigned int R> inline
1075 void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
1076 {
1077  // Indexes into the W[] array
1078  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1079 
1080  const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
1081  const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
1082 
1083  uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1084  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1085  uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1086 
1087  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1088  S[E] = S[D] + T1;
1089  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1090  S[A] = T1 + T2;
1091 }
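Both round helpers are direct transcriptions of the FIPS 180-4 SHA-256 compression step:

  T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
  T2 = Sigma0(a) + Maj(a,b,c)
  (h,g,f,e,d,c,b,a) <- (g,f,e, d+T1, c,b,a, T1+T2)

SHA256_ROUND1 is used for t = 0..15, where W[t] is simply the loaded message word M. SHA256_ROUND2 additionally expands the message schedule in place over a 16-entry circular buffer, W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]; the IDX0/IDX1/IDX9/IDX14 constants are those offsets reduced mod 16.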
1092 
1093 void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1094 {
1095  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1096  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1097  CRYPTOPP_UNUSED(order);
1098 
1099  const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1100  const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1101 
1102  uint32x4_p abcd = VecLoad(state+0);
1103  uint32x4_p efgh = VecLoad(state+4);
1104  uint32x4_p W[16], S[8], vm, vk;
1105 
1106  size_t blocks = length / SHA256::BLOCKSIZE;
1107  while (blocks--)
1108  {
1109  unsigned int offset=0;
1110 
1111  S[A] = abcd; S[E] = efgh;
1112  S[B] = VecShiftLeftOctet<4>(S[A]);
1113  S[F] = VecShiftLeftOctet<4>(S[E]);
1114  S[C] = VecShiftLeftOctet<4>(S[B]);
1115  S[G] = VecShiftLeftOctet<4>(S[F]);
1116  S[D] = VecShiftLeftOctet<4>(S[C]);
1117  S[H] = VecShiftLeftOctet<4>(S[G]);
1118 
1119  // Rounds 0-15
1120  vk = VecLoad(offset, k);
1121  vm = VecLoad32(m, offset);
1122  SHA256_ROUND1<0>(W,S, vk,vm);
1123  offset+=16;
1124 
1125  vk = VecShiftLeftOctet<4>(vk);
1126  vm = VecShiftLeftOctet<4>(vm);
1127  SHA256_ROUND1<1>(W,S, vk,vm);
1128 
1129  vk = VecShiftLeftOctet<4>(vk);
1130  vm = VecShiftLeftOctet<4>(vm);
1131  SHA256_ROUND1<2>(W,S, vk,vm);
1132 
1133  vk = VecShiftLeftOctet<4>(vk);
1134  vm = VecShiftLeftOctet<4>(vm);
1135  SHA256_ROUND1<3>(W,S, vk,vm);
1136 
1137  vk = VecLoad(offset, k);
1138  vm = VecLoad32(m, offset);
1139  SHA256_ROUND1<4>(W,S, vk,vm);
1140  offset+=16;
1141 
1142  vk = VecShiftLeftOctet<4>(vk);
1143  vm = VecShiftLeftOctet<4>(vm);
1144  SHA256_ROUND1<5>(W,S, vk,vm);
1145 
1146  vk = VecShiftLeftOctet<4>(vk);
1147  vm = VecShiftLeftOctet<4>(vm);
1148  SHA256_ROUND1<6>(W,S, vk,vm);
1149 
1150  vk = VecShiftLeftOctet<4>(vk);
1151  vm = VecShiftLeftOctet<4>(vm);
1152  SHA256_ROUND1<7>(W,S, vk,vm);
1153 
1154  vk = VecLoad(offset, k);
1155  vm = VecLoad32(m, offset);
1156  SHA256_ROUND1<8>(W,S, vk,vm);
1157  offset+=16;
1158 
1159  vk = VecShiftLeftOctet<4>(vk);
1160  vm = VecShiftLeftOctet<4>(vm);
1161  SHA256_ROUND1<9>(W,S, vk,vm);
1162 
1163  vk = VecShiftLeftOctet<4>(vk);
1164  vm = VecShiftLeftOctet<4>(vm);
1165  SHA256_ROUND1<10>(W,S, vk,vm);
1166 
1167  vk = VecShiftLeftOctet<4>(vk);
1168  vm = VecShiftLeftOctet<4>(vm);
1169  SHA256_ROUND1<11>(W,S, vk,vm);
1170 
1171  vk = VecLoad(offset, k);
1172  vm = VecLoad32(m, offset);
1173  SHA256_ROUND1<12>(W,S, vk,vm);
1174  offset+=16;
1175 
1176  vk = VecShiftLeftOctet<4>(vk);
1177  vm = VecShiftLeftOctet<4>(vm);
1178  SHA256_ROUND1<13>(W,S, vk,vm);
1179 
1180  vk = VecShiftLeftOctet<4>(vk);
1181  vm = VecShiftLeftOctet<4>(vm);
1182  SHA256_ROUND1<14>(W,S, vk,vm);
1183 
1184  vk = VecShiftLeftOctet<4>(vk);
1185  vm = VecShiftLeftOctet<4>(vm);
1186  SHA256_ROUND1<15>(W,S, vk,vm);
1187 
1188  m += 16; // 32-bit words, not bytes
1189 
1190  // Rounds 16-63
1191  for (unsigned int i=16; i<64; i+=16)
1192  {
1193  vk = VecLoad(offset, k);
1194  SHA256_ROUND2<0>(W,S, vk);
1195  SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
1196  SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
1197  SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
1198  offset+=16;
1199 
1200  vk = VecLoad(offset, k);
1201  SHA256_ROUND2<4>(W,S, vk);
1202  SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
1203  SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
1204  SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
1205  offset+=16;
1206 
1207  vk = VecLoad(offset, k);
1208  SHA256_ROUND2<8>(W,S, vk);
1209  SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
1210  SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
1211  SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
1212  offset+=16;
1213 
1214  vk = VecLoad(offset, k);
1215  SHA256_ROUND2<12>(W,S, vk);
1216  SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
1217  SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
1218  SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
1219  offset+=16;
1220  }
1221 
1222  abcd += VectorPack(S[A],S[B],S[C],S[D]);
1223  efgh += VectorPack(S[E],S[F],S[G],S[H]);
1224  }
1225 
1226  VecStore32(abcd, state+0);
1227  VecStore32(efgh, state+4);
1228 }
1229 
1230 inline
1231 void VecStore64(const uint64x2_p val, word64* data)
1232 {
1233  VecStore(val, data);
1234 }
1235 
1236 inline
1237 uint64x2_p VecLoad64(const word64* data, int offset)
1238 {
1239 #if (CRYPTOPP_LITTLE_ENDIAN)
1240  const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1241  return VecPermute(VecLoad(offset, data), mask);
1242 #else
1243  return VecLoad(offset, data);
1244 #endif
1245 }
1246 
1247 inline
1248 uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1249 {
1250  // The trick below is due to Andy Polyakov and Jack Lloyd
1251  return vec_sel(z,y,x);
1252 }
1253 
1254 inline
1255 uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1256 {
1257  // The trick below is due to Andy Polyakov and Jack Lloyd
1258  return vec_sel(y, z, VecXor(x, y));
1259 }
1260 
1261 inline
1262 uint64x2_p Vector_sigma0(const uint64x2_p val)
1263 {
1264  return VecSHA512<0,0>(val);
1265 }
1266 
1267 inline
1268 uint64x2_p Vector_sigma1(const uint64x2_p val)
1269 {
1270  return VecSHA512<0,0xf>(val);
1271 }
1272 
1273 inline
1274 uint64x2_p VectorSigma0(const uint64x2_p val)
1275 {
1276  return VecSHA512<1,0>(val);
1277 }
1278 
1279 inline
1280 uint64x2_p VectorSigma1(const uint64x2_p val)
1281 {
1282  return VecSHA512<1,0xf>(val);
1283 }
1284 
1285 inline
1286 uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
1287 {
1288  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1289  return VecPermute(x,y,m);
1290 }
1291 
1292 template <unsigned int R> inline
1293 void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
1294 {
1295  uint64x2_p T1, T2;
1296 
1297  W[R] = M;
1298  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1299  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1300 
1301  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1302  S[E] = S[D] + T1;
1303  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1304  S[A] = T1 + T2;
1305 }
1306 
1307 template <unsigned int R> inline
1308 void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
1309 {
1310  // Indexes into the W[] array
1311  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1312 
1313  const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
1314  const uint64x2_p s1 = Vector_sigma1(W[IDX14]);
1315 
1316  uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1317  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1318  uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1319 
1320  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1321  S[E] = S[D] + T1;
1322  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1323  S[A] = T1 + T2;
1324 }
1325 
1326 void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1327 {
1328  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1329  CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1330  CRYPTOPP_UNUSED(order);
1331 
1332  const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1333  const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1334 
1335  uint64x2_p ab = VecLoad(state+0);
1336  uint64x2_p cd = VecLoad(state+2);
1337  uint64x2_p ef = VecLoad(state+4);
1338  uint64x2_p gh = VecLoad(state+6);
1339  uint64x2_p W[16], S[8], vm, vk;
1340 
1341  size_t blocks = length / SHA512::BLOCKSIZE;
1342  while (blocks--)
1343  {
1344  unsigned int offset=0;
1345 
1346  S[A] = ab; S[C] = cd;
1347  S[E] = ef; S[G] = gh;
1348  S[B] = VecShiftLeftOctet<8>(S[A]);
1349  S[D] = VecShiftLeftOctet<8>(S[C]);
1350  S[F] = VecShiftLeftOctet<8>(S[E]);
1351  S[H] = VecShiftLeftOctet<8>(S[G]);
1352 
1353  // Rounds 0-15
1354  vk = VecLoad(offset, k);
1355  vm = VecLoad64(m, offset);
1356  SHA512_ROUND1<0>(W,S, vk,vm);
1357  offset+=16;
1358 
1359  vk = VecShiftLeftOctet<8>(vk);
1360  vm = VecShiftLeftOctet<8>(vm);
1361  SHA512_ROUND1<1>(W,S, vk,vm);
1362 
1363  vk = VecLoad(offset, k);
1364  vm = VecLoad64(m, offset);
1365  SHA512_ROUND1<2>(W,S, vk,vm);
1366  offset+=16;
1367 
1368  vk = VecShiftLeftOctet<8>(vk);
1369  vm = VecShiftLeftOctet<8>(vm);
1370  SHA512_ROUND1<3>(W,S, vk,vm);
1371 
1372  vk = VecLoad(offset, k);
1373  vm = VecLoad64(m, offset);
1374  SHA512_ROUND1<4>(W,S, vk,vm);
1375  offset+=16;
1376 
1377  vk = VecShiftLeftOctet<8>(vk);
1378  vm = VecShiftLeftOctet<8>(vm);
1379  SHA512_ROUND1<5>(W,S, vk,vm);
1380 
1381  vk = VecLoad(offset, k);
1382  vm = VecLoad64(m, offset);
1383  SHA512_ROUND1<6>(W,S, vk,vm);
1384  offset+=16;
1385 
1386  vk = VecShiftLeftOctet<8>(vk);
1387  vm = VecShiftLeftOctet<8>(vm);
1388  SHA512_ROUND1<7>(W,S, vk,vm);
1389 
1390  vk = VecLoad(offset, k);
1391  vm = VecLoad64(m, offset);
1392  SHA512_ROUND1<8>(W,S, vk,vm);
1393  offset+=16;
1394 
1395  vk = VecShiftLeftOctet<8>(vk);
1396  vm = VecShiftLeftOctet<8>(vm);
1397  SHA512_ROUND1<9>(W,S, vk,vm);
1398 
1399  vk = VecLoad(offset, k);
1400  vm = VecLoad64(m, offset);
1401  SHA512_ROUND1<10>(W,S, vk,vm);
1402  offset+=16;
1403 
1404  vk = VecShiftLeftOctet<8>(vk);
1405  vm = VecShiftLeftOctet<8>(vm);
1406  SHA512_ROUND1<11>(W,S, vk,vm);
1407 
1408  vk = VecLoad(offset, k);
1409  vm = VecLoad64(m, offset);
1410  SHA512_ROUND1<12>(W,S, vk,vm);
1411  offset+=16;
1412 
1413  vk = VecShiftLeftOctet<8>(vk);
1414  vm = VecShiftLeftOctet<8>(vm);
1415  SHA512_ROUND1<13>(W,S, vk,vm);
1416 
1417  vk = VecLoad(offset, k);
1418  vm = VecLoad64(m, offset);
1419  SHA512_ROUND1<14>(W,S, vk,vm);
1420  offset+=16;
1421 
1422  vk = VecShiftLeftOctet<8>(vk);
1423  vm = VecShiftLeftOctet<8>(vm);
1424  SHA512_ROUND1<15>(W,S, vk,vm);
1425 
1426  m += 16; // 64-bit words, not bytes
1427 
1428  // Rounds 16-79
1429  for (unsigned int i=16; i<80; i+=16)
1430  {
1431  vk = VecLoad(offset, k);
1432  SHA512_ROUND2<0>(W,S, vk);
1433  SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
1434  offset+=16;
1435 
1436  vk = VecLoad(offset, k);
1437  SHA512_ROUND2<2>(W,S, vk);
1438  SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
1439  offset+=16;
1440 
1441  vk = VecLoad(offset, k);
1442  SHA512_ROUND2<4>(W,S, vk);
1443  SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
1444  offset+=16;
1445 
1446  vk = VecLoad(offset, k);
1447  SHA512_ROUND2<6>(W,S, vk);
1448  SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
1449  offset+=16;
1450 
1451  vk = VecLoad(offset, k);
1452  SHA512_ROUND2<8>(W,S, vk);
1453  SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
1454  offset+=16;
1455 
1456  vk = VecLoad(offset, k);
1457  SHA512_ROUND2<10>(W,S, vk);
1458  SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
1459  offset+=16;
1460 
1461  vk = VecLoad(offset, k);
1462  SHA512_ROUND2<12>(W,S, vk);
1463  SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
1464  offset+=16;
1465 
1466  vk = VecLoad(offset, k);
1467  SHA512_ROUND2<14>(W,S, vk);
1468  SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
1469  offset+=16;
1470  }
1471 
1472  ab += VectorPack(S[A],S[B]);
1473  cd += VectorPack(S[C],S[D]);
1474  ef += VectorPack(S[E],S[F]);
1475  gh += VectorPack(S[G],S[H]);
1476  }
1477 
1478  VecStore64(ab, state+0);
1479  VecStore64(cd, state+2);
1480  VecStore64(ef, state+4);
1481  VecStore64(gh, state+6);
1482 }
1483 
1484 #endif // CRYPTOPP_POWER8_SHA_AVAILABLE
1485 
1486 ////////////////////////////////////////////////
1487 // end Gustavo, Serra, Scalet and Walton code //
1488 ////////////////////////////////////////////////
1489 
1490 NAMESPACE_END