Crypto++  8.0
Free C++ class library of cryptographic schemes
blake2s_simd.cpp
1 // blake2s_simd.cpp - written and placed in the public domain by
2 // Samuel Neves, Jeffrey Walton, Uri Blumenthal
3 // and Marcel Raad.
4 //
5 // This source file uses intrinsics to gain access to ARMv7a/ARMv8a
6 // NEON, Power7 and SSE4.1 instructions. A separate source file is
7 // needed because additional CXXFLAGS are required to enable the
8 // appropriate instructions sets in some build configurations.
9 
10 // The BLAKE2b and BLAKE2s numbers are consistent with the BLAKE2 team's
11 // numbers. However, we have an Altivec/POWER7 implementation of BLAKE2s,
12 // and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
13 // Altivec/POWER7 code is about 2x faster than C++ when using GCC 5.0 or
14 // above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
15 // or above. If you use GCC 4.0 (PowerMac) or GCC 4.8 (GCC Compile Farm)
16 // then the PowerPC code will be slower than C++. Be sure to use GCC 5.0
17 // or above for PowerPC builds or disable Altivec for BLAKE2b and BLAKE2s
18 // if using the old compilers.
19 
20 #include "pch.h"
21 #include "config.h"
22 #include "misc.h"
23 #include "blake2.h"
24 
25 // Uncomment for benchmarking C++ against SSE2 or NEON.
26 // Do so in both blake2.cpp and blake2s_simd.cpp.
27 // #undef CRYPTOPP_SSE41_AVAILABLE
28 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
29 // #undef CRYPTOPP_ALTIVEC_AVAILABLE
30 
31 // Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
32 // 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
33 #if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
34 # undef CRYPTOPP_ARM_NEON_AVAILABLE
35 #endif
36 
37 // BLAKE2s bug on AIX 7.1 (POWER7) with XLC 12.01
38 // https://github.com/weidai11/cryptopp/issues/743
39 #if defined(__xlC__) && (__xlC__ < 0x0d01)
40 # define CRYPTOPP_DISABLE_ALTIVEC 1
41 # define CRYPTOPP_POWER7_ALTIVEC 1
42 # undef CRYPTOPP_POWER7_AVAILABLE
43 # undef CRYPTOPP_ALTIVEC_AVAILABLE
44 #endif
45 
46 #if (CRYPTOPP_SSE41_AVAILABLE)
47 # include <emmintrin.h>
48 # include <tmmintrin.h>
49 # include <smmintrin.h>
50 #endif
51 
52 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
53 # include <arm_neon.h>
54 #endif
55 
56 // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
57 // compilers don't follow ACLE conventions for the include.
58 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
59 # include <stdint.h>
60 # include <arm_acle.h>
61 #endif
62 
63 #if (CRYPTOPP_ALTIVEC_AVAILABLE)
64 # include "ppc_simd.h"
65 #endif
66 
67 // Squash MS LNK4221 and libtool warnings
68 extern const char BLAKE2S_SIMD_FNAME[] = __FILE__;
69 
70 NAMESPACE_BEGIN(CryptoPP)
71 
72 // Exported by blake2.cpp
73 extern const word32 BLAKE2S_IV[8];
74 extern const word64 BLAKE2B_IV[8];
75 
76 #if CRYPTOPP_SSE41_AVAILABLE
77 
78 #define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p))
79 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)
80 #define TOF(reg) _mm_castsi128_ps((reg))
81 #define TOI(reg) _mm_castps_si128((reg))
82 
83 void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2s_State& state)
84 {
85  #define BLAKE2S_LOAD_MSG_0_1(buf) \
86  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));
87 
88  #define BLAKE2S_LOAD_MSG_0_2(buf) \
89  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1)));
90 
91  #define BLAKE2S_LOAD_MSG_0_3(buf) \
92  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0)));
93 
94  #define BLAKE2S_LOAD_MSG_0_4(buf) \
95  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1)));
96 
97  #define BLAKE2S_LOAD_MSG_1_1(buf) \
98  t0 = _mm_blend_epi16(m1, m2, 0x0C); \
99  t1 = _mm_slli_si128(m3, 4); \
100  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
101  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
102 
103  #define BLAKE2S_LOAD_MSG_1_2(buf) \
104  t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \
105  t1 = _mm_blend_epi16(m1,m3,0xC0); \
106  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
107  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
108 
109  #define BLAKE2S_LOAD_MSG_1_3(buf) \
110  t0 = _mm_slli_si128(m1, 4); \
111  t1 = _mm_blend_epi16(m2, t0, 0x30); \
112  t2 = _mm_blend_epi16(m0, t1, 0xF0); \
113  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
114 
115  #define BLAKE2S_LOAD_MSG_1_4(buf) \
116  t0 = _mm_unpackhi_epi32(m0,m1); \
117  t1 = _mm_slli_si128(m3, 4); \
118  t2 = _mm_blend_epi16(t0, t1, 0x0C); \
119  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
120 
121  #define BLAKE2S_LOAD_MSG_2_1(buf) \
122  t0 = _mm_unpackhi_epi32(m2,m3); \
123  t1 = _mm_blend_epi16(m3,m1,0x0C); \
124  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
125  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
126 
127  #define BLAKE2S_LOAD_MSG_2_2(buf) \
128  t0 = _mm_unpacklo_epi32(m2,m0); \
129  t1 = _mm_blend_epi16(t0, m0, 0xF0); \
130  t2 = _mm_slli_si128(m3, 8); \
131  buf = _mm_blend_epi16(t1, t2, 0xC0);
132 
133  #define BLAKE2S_LOAD_MSG_2_3(buf) \
134  t0 = _mm_blend_epi16(m0, m2, 0x3C); \
135  t1 = _mm_srli_si128(m1, 12); \
136  t2 = _mm_blend_epi16(t0,t1,0x03); \
137  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
138 
139  #define BLAKE2S_LOAD_MSG_2_4(buf) \
140  t0 = _mm_slli_si128(m3, 4); \
141  t1 = _mm_blend_epi16(m0, m1, 0x33); \
142  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
143  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
144 
145  #define BLAKE2S_LOAD_MSG_3_1(buf) \
146  t0 = _mm_unpackhi_epi32(m0,m1); \
147  t1 = _mm_unpackhi_epi32(t0, m2); \
148  t2 = _mm_blend_epi16(t1, m3, 0x0C); \
149  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
150 
151  #define BLAKE2S_LOAD_MSG_3_2(buf) \
152  t0 = _mm_slli_si128(m2, 8); \
153  t1 = _mm_blend_epi16(m3,m0,0x0C); \
154  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
155  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
156 
157  #define BLAKE2S_LOAD_MSG_3_3(buf) \
158  t0 = _mm_blend_epi16(m0,m1,0x0F); \
159  t1 = _mm_blend_epi16(t0, m3, 0xC0); \
160  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
161 
162  #define BLAKE2S_LOAD_MSG_3_4(buf) \
163  t0 = _mm_unpacklo_epi32(m0,m2); \
164  t1 = _mm_unpackhi_epi32(m1,m2); \
165  buf = _mm_unpacklo_epi64(t1,t0);
166 
167  #define BLAKE2S_LOAD_MSG_4_1(buf) \
168  t0 = _mm_unpacklo_epi64(m1,m2); \
169  t1 = _mm_unpackhi_epi64(m0,m2); \
170  t2 = _mm_blend_epi16(t0,t1,0x33); \
171  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
172 
173  #define BLAKE2S_LOAD_MSG_4_2(buf) \
174  t0 = _mm_unpackhi_epi64(m1,m3); \
175  t1 = _mm_unpacklo_epi64(m0,m1); \
176  buf = _mm_blend_epi16(t0,t1,0x33);
177 
178  #define BLAKE2S_LOAD_MSG_4_3(buf) \
179  t0 = _mm_unpackhi_epi64(m3,m1); \
180  t1 = _mm_unpackhi_epi64(m2,m0); \
181  buf = _mm_blend_epi16(t1,t0,0x33);
182 
183  #define BLAKE2S_LOAD_MSG_4_4(buf) \
184  t0 = _mm_blend_epi16(m0,m2,0x03); \
185  t1 = _mm_slli_si128(t0, 8); \
186  t2 = _mm_blend_epi16(t1,m3,0x0F); \
187  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
188 
189  #define BLAKE2S_LOAD_MSG_5_1(buf) \
190  t0 = _mm_unpackhi_epi32(m0,m1); \
191  t1 = _mm_unpacklo_epi32(m0,m2); \
192  buf = _mm_unpacklo_epi64(t0,t1);
193 
194  #define BLAKE2S_LOAD_MSG_5_2(buf) \
195  t0 = _mm_srli_si128(m2, 4); \
196  t1 = _mm_blend_epi16(m0,m3,0x03); \
197  buf = _mm_blend_epi16(t1,t0,0x3C);
198 
199  #define BLAKE2S_LOAD_MSG_5_3(buf) \
200  t0 = _mm_blend_epi16(m1,m0,0x0C); \
201  t1 = _mm_srli_si128(m3, 4); \
202  t2 = _mm_blend_epi16(t0,t1,0x30); \
203  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
204 
205  #define BLAKE2S_LOAD_MSG_5_4(buf) \
206  t0 = _mm_unpacklo_epi64(m1,m2); \
207  t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
208  buf = _mm_blend_epi16(t0,t1,0x33);
209 
210  #define BLAKE2S_LOAD_MSG_6_1(buf) \
211  t0 = _mm_slli_si128(m1, 12); \
212  t1 = _mm_blend_epi16(m0,m3,0x33); \
213  buf = _mm_blend_epi16(t1,t0,0xC0);
214 
215  #define BLAKE2S_LOAD_MSG_6_2(buf) \
216  t0 = _mm_blend_epi16(m3,m2,0x30); \
217  t1 = _mm_srli_si128(m1, 4); \
218  t2 = _mm_blend_epi16(t0,t1,0x03); \
219  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
220 
221  #define BLAKE2S_LOAD_MSG_6_3(buf) \
222  t0 = _mm_unpacklo_epi64(m0,m2); \
223  t1 = _mm_srli_si128(m1, 4); \
224  buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
225 
226  #define BLAKE2S_LOAD_MSG_6_4(buf) \
227  t0 = _mm_unpackhi_epi32(m1,m2); \
228  t1 = _mm_unpackhi_epi64(m0,t0); \
229  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
230 
231  #define BLAKE2S_LOAD_MSG_7_1(buf) \
232  t0 = _mm_unpackhi_epi32(m0,m1); \
233  t1 = _mm_blend_epi16(t0,m3,0x0F); \
234  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
235 
236  #define BLAKE2S_LOAD_MSG_7_2(buf) \
237  t0 = _mm_blend_epi16(m2,m3,0x30); \
238  t1 = _mm_srli_si128(m0,4); \
239  t2 = _mm_blend_epi16(t0,t1,0x03); \
240  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
241 
242  #define BLAKE2S_LOAD_MSG_7_3(buf) \
243  t0 = _mm_unpackhi_epi64(m0,m3); \
244  t1 = _mm_unpacklo_epi64(m1,m2); \
245  t2 = _mm_blend_epi16(t0,t1,0x3C); \
246  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
247 
248  #define BLAKE2S_LOAD_MSG_7_4(buf) \
249  t0 = _mm_unpacklo_epi32(m0,m1); \
250  t1 = _mm_unpackhi_epi32(m1,m2); \
251  buf = _mm_unpacklo_epi64(t0,t1);
252 
253  #define BLAKE2S_LOAD_MSG_8_1(buf) \
254  t0 = _mm_unpackhi_epi32(m1,m3); \
255  t1 = _mm_unpacklo_epi64(t0,m0); \
256  t2 = _mm_blend_epi16(t1,m2,0xC0); \
257  buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
258 
259  #define BLAKE2S_LOAD_MSG_8_2(buf) \
260  t0 = _mm_unpackhi_epi32(m0,m3); \
261  t1 = _mm_blend_epi16(m2,t0,0xF0); \
262  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
263 
264  #define BLAKE2S_LOAD_MSG_8_3(buf) \
265  t0 = _mm_blend_epi16(m2,m0,0x0C); \
266  t1 = _mm_slli_si128(t0,4); \
267  buf = _mm_blend_epi16(t1,m3,0x0F);
268 
269  #define BLAKE2S_LOAD_MSG_8_4(buf) \
270  t0 = _mm_blend_epi16(m1,m0,0x30); \
271  buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
272 
273  #define BLAKE2S_LOAD_MSG_9_1(buf) \
274  t0 = _mm_blend_epi16(m0,m2,0x03); \
275  t1 = _mm_blend_epi16(m1,m2,0x30); \
276  t2 = _mm_blend_epi16(t1,t0,0x0F); \
277  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
278 
279  #define BLAKE2S_LOAD_MSG_9_2(buf) \
280  t0 = _mm_slli_si128(m0,4); \
281  t1 = _mm_blend_epi16(m1,t0,0xC0); \
282  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
283 
284  #define BLAKE2S_LOAD_MSG_9_3(buf) \
285  t0 = _mm_unpackhi_epi32(m0,m3); \
286  t1 = _mm_unpacklo_epi32(m2,m3); \
287  t2 = _mm_unpackhi_epi64(t0,t1); \
288  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
289 
290  #define BLAKE2S_LOAD_MSG_9_4(buf) \
291  t0 = _mm_blend_epi16(m3,m2,0xC0); \
292  t1 = _mm_unpacklo_epi32(m0,m3); \
293  t2 = _mm_blend_epi16(t0,t1,0x0F); \
294  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
295 
296 #ifdef __XOP__
297 # define MM_ROTI_EPI32(r, c) \
298  _mm_roti_epi32(r, c)
299 #else
300 # define MM_ROTI_EPI32(r, c) ( \
301  (8==-(c)) ? _mm_shuffle_epi8(r,r8) \
302  : (16==-(c)) ? _mm_shuffle_epi8(r,r16) \
303  : _mm_xor_si128(_mm_srli_epi32((r), -(c)), \
304  _mm_slli_epi32((r), 32-(-(c)))))
305 #endif
306 
307 #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
308  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf), row2); \
309  row4 = _mm_xor_si128(row4, row1); \
310  row4 = MM_ROTI_EPI32(row4, -16); \
311  row3 = _mm_add_epi32(row3, row4); \
312  row2 = _mm_xor_si128(row2, row3); \
313  row2 = MM_ROTI_EPI32(row2, -12);
314 
315 #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
316  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf), row2); \
317  row4 = _mm_xor_si128(row4, row1); \
318  row4 = MM_ROTI_EPI32(row4, -8); \
319  row3 = _mm_add_epi32(row3, row4); \
320  row2 = _mm_xor_si128(row2, row3); \
321  row2 = MM_ROTI_EPI32(row2, -7);
322 
323 #define DIAGONALIZE(row1,row2,row3,row4) \
324  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); \
325  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); \
326  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
327 
328 #define UNDIAGONALIZE(row1,row2,row3,row4) \
329  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); \
330  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); \
331  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
332 
333 #define BLAKE2S_ROUND(r) \
334  BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
335  BLAKE2S_G1(row1,row2,row3,row4,buf1); \
336  BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \
337  BLAKE2S_G2(row1,row2,row3,row4,buf2); \
338  DIAGONALIZE(row1,row2,row3,row4); \
339  BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \
340  BLAKE2S_G1(row1,row2,row3,row4,buf3); \
341  BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \
342  BLAKE2S_G2(row1,row2,row3,row4,buf4); \
343  UNDIAGONALIZE(row1,row2,row3,row4);
344 
345  __m128i row1, row2, row3, row4;
346  __m128i buf1, buf2, buf3, buf4;
347  __m128i t0, t1, t2, ff0, ff1;
348 
349  const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
350  const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
351 
352  const __m128i m0 = LOADU(input + 00);
353  const __m128i m1 = LOADU(input + 16);
354  const __m128i m2 = LOADU(input + 32);
355  const __m128i m3 = LOADU(input + 48);
356 
357  row1 = ff0 = LOADU(state.h()+0);
358  row2 = ff1 = LOADU(state.h()+4);
359  row3 = LOADU(BLAKE2S_IV+0);
360  row4 = _mm_xor_si128(LOADU(BLAKE2S_IV+4), LOADU(state.t()+0));
361 
362  BLAKE2S_ROUND(0);
363  BLAKE2S_ROUND(1);
364  BLAKE2S_ROUND(2);
365  BLAKE2S_ROUND(3);
366  BLAKE2S_ROUND(4);
367  BLAKE2S_ROUND(5);
368  BLAKE2S_ROUND(6);
369  BLAKE2S_ROUND(7);
370  BLAKE2S_ROUND(8);
371  BLAKE2S_ROUND(9);
372 
373  STOREU(state.h()+0, _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
374  STOREU(state.h()+4, _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
375 }
376 #endif // CRYPTOPP_SSE41_AVAILABLE
377 
378 #if CRYPTOPP_ARM_NEON_AVAILABLE
379 void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
380 {
381  #define BLAKE2S_LOAD_MSG_0_1(buf) \
382  do { uint32x2_t t0, t1; \
383  t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[0]; \
384  t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[0]; \
385  buf = vcombine_u32(t0, t1); } while(0)
386 
387  #define BLAKE2S_LOAD_MSG_0_2(buf) \
388  do { uint32x2_t t0, t1; \
389  t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[1]; \
390  t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[1]; \
391  buf = vcombine_u32(t0, t1); } while(0)
392 
393  #define BLAKE2S_LOAD_MSG_0_3(buf) \
394  do { uint32x2_t t0, t1; \
395  t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[0]; \
396  t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
397  buf = vcombine_u32(t0, t1); } while(0)
398 
399  #define BLAKE2S_LOAD_MSG_0_4(buf) \
400  do { uint32x2_t t0, t1; \
401  t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[1]; \
402  t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[1]; \
403  buf = vcombine_u32(t0, t1); } while(0)
404 
405  #define BLAKE2S_LOAD_MSG_1_1(buf) \
406  do { uint32x2_t t0, t1; \
407  t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
408  t1 = vzip_u32(vget_low_u32(m2), vget_low_u32(m3)).val[1]; \
409  buf = vcombine_u32(t0, t1); } while(0)
410 
411  #define BLAKE2S_LOAD_MSG_1_2(buf) \
412  do { uint32x2_t t0, t1; \
413  t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
414  t1 = vext_u32(vget_high_u32(m3), vget_high_u32(m1), 1); \
415  buf = vcombine_u32(t0, t1); } while(0)
416 
417  #define BLAKE2S_LOAD_MSG_1_3(buf) \
418  do { uint32x2_t t0, t1; \
419  t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m0), 1); \
420  t1 = vzip_u32(vget_high_u32(m2), vget_low_u32(m1)).val[1]; \
421  buf = vcombine_u32(t0, t1); } while(0)
422 
423  #define BLAKE2S_LOAD_MSG_1_4(buf) \
424  do { uint32x2_t t0, t1; \
425  t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m0)).val[0]; \
426  t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
427  buf = vcombine_u32(t0, t1); } while(0)
428 
429  #define BLAKE2S_LOAD_MSG_2_1(buf) \
430  do { uint32x2_t t0, t1; \
431  t0 = vext_u32(vget_high_u32(m2), vget_low_u32(m3), 1); \
432  t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
433  buf = vcombine_u32(t0, t1); } while(0)
434 
435  #define BLAKE2S_LOAD_MSG_2_2(buf) \
436  do { uint32x2_t t0, t1; \
437  t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[0]; \
438  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m3)); \
439  buf = vcombine_u32(t0, t1); } while(0)
440 
441  #define BLAKE2S_LOAD_MSG_2_3(buf) \
442  do { uint32x2_t t0, t1; \
443  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m2), vget_high_u32(m0)); \
444  t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m2)).val[1]; \
445  buf = vcombine_u32(t0, t1); } while(0)
446 
447  #define BLAKE2S_LOAD_MSG_2_4(buf) \
448  do { uint32x2_t t0, t1; \
449  t0 = vzip_u32(vget_high_u32(m3), vget_high_u32(m1)).val[0]; \
450  t1 = vext_u32(vget_low_u32(m0), vget_low_u32(m1), 1); \
451  buf = vcombine_u32(t0, t1); } while(0)
452 
453  #define BLAKE2S_LOAD_MSG_3_1(buf) \
454  do { uint32x2_t t0, t1; \
455  t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
456  t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[1]; \
457  buf = vcombine_u32(t0, t1); } while(0)
458 
459  #define BLAKE2S_LOAD_MSG_3_2(buf) \
460  do { uint32x2_t t0, t1; \
461  t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[1]; \
462  t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
463  buf = vcombine_u32(t0, t1); } while(0)
464 
465  #define BLAKE2S_LOAD_MSG_3_3(buf) \
466  do { uint32x2_t t0, t1; \
467  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m1)); \
468  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
469  buf = vcombine_u32(t0, t1); } while(0)
470 
471  #define BLAKE2S_LOAD_MSG_3_4(buf) \
472  do { uint32x2_t t0, t1; \
473  t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
474  t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
475  buf = vcombine_u32(t0, t1); } while(0)
476 
477  #define BLAKE2S_LOAD_MSG_4_1(buf) \
478  do { uint32x2_t t0, t1; \
479  t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m1)).val[1]; \
480  t1 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m2)).val[0]; \
481  buf = vcombine_u32(t0, t1); } while(0)
482 
483  #define BLAKE2S_LOAD_MSG_4_2(buf) \
484  do { uint32x2_t t0, t1; \
485  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m0), vget_high_u32(m1)); \
486  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
487  buf = vcombine_u32(t0, t1); } while(0)
488 
489  #define BLAKE2S_LOAD_MSG_4_3(buf) \
490  do { uint32x2_t t0, t1; \
491  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_high_u32(m2)); \
492  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_high_u32(m0)); \
493  buf = vcombine_u32(t0, t1); } while(0)
494 
495  #define BLAKE2S_LOAD_MSG_4_4(buf) \
496  do { uint32x2_t t0, t1; \
497  t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m3), 1); \
498  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m2), vget_low_u32(m3)); \
499  buf = vcombine_u32(t0, t1); } while(0)
500 
501  #define BLAKE2S_LOAD_MSG_5_1(buf) \
502  do { uint32x2_t t0, t1; \
503  t0 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m1)).val[0]; \
504  t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
505  buf = vcombine_u32(t0, t1); } while(0)
506 
507  #define BLAKE2S_LOAD_MSG_5_2(buf) \
508  do { uint32x2_t t0, t1; \
509  t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[0]; \
510  t1 = vzip_u32(vget_high_u32(m2), vget_high_u32(m0)).val[1]; \
511  buf = vcombine_u32(t0, t1); } while(0)
512 
513  #define BLAKE2S_LOAD_MSG_5_3(buf) \
514  do { uint32x2_t t0, t1; \
515  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m1)); \
516  t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m0)).val[1]; \
517  buf = vcombine_u32(t0, t1); } while(0)
518 
519  #define BLAKE2S_LOAD_MSG_5_4(buf) \
520  do { uint32x2_t t0, t1; \
521  t0 = vzip_u32(vget_low_u32(m3), vget_low_u32(m1)).val[1]; \
522  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_low_u32(m2)); \
523  buf = vcombine_u32(t0, t1); } while(0)
524 
525  #define BLAKE2S_LOAD_MSG_6_1(buf) \
526  do { uint32x2_t t0, t1; \
527  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m0)); \
528  t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
529  buf = vcombine_u32(t0, t1); } while(0)
530 
531  #define BLAKE2S_LOAD_MSG_6_2(buf) \
532  do { uint32x2_t t0, t1; \
533  t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
534  t1 = vext_u32(vget_low_u32(m3), vget_high_u32(m2), 1); \
535  buf = vcombine_u32(t0, t1); } while(0)
536 
537  #define BLAKE2S_LOAD_MSG_6_3(buf) \
538  do { uint32x2_t t0, t1; \
539  t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m1)).val[0]; \
540  t1 = vext_u32(vget_low_u32(m2), vget_low_u32(m2), 1); \
541  buf = vcombine_u32(t0, t1); } while(0)
542 
543  #define BLAKE2S_LOAD_MSG_6_4(buf) \
544  do { uint32x2_t t0, t1; \
545  t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
546  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m2)); \
547  buf = vcombine_u32(t0, t1); } while(0)
548 
549  #define BLAKE2S_LOAD_MSG_7_1(buf) \
550  do { uint32x2_t t0, t1; \
551  t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m1)).val[1]; \
552  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_high_u32(m0)); \
553  buf = vcombine_u32(t0, t1); } while(0)
554 
555  #define BLAKE2S_LOAD_MSG_7_2(buf) \
556  do { uint32x2_t t0, t1; \
557  t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
558  t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[1]; \
559  buf = vcombine_u32(t0, t1); } while(0)
560 
561  #define BLAKE2S_LOAD_MSG_7_3(buf) \
562  do { uint32x2_t t0, t1; \
563  t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
564  t1 = vzip_u32(vget_low_u32(m2), vget_high_u32(m0)).val[0]; \
565  buf = vcombine_u32(t0, t1); } while(0)
566 
567  #define BLAKE2S_LOAD_MSG_7_4(buf) \
568  do { uint32x2_t t0, t1; \
569  t0 = vzip_u32(vget_low_u32(m0), vget_low_u32(m1)).val[0]; \
570  t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
571  buf = vcombine_u32(t0, t1); } while(0)
572 
573  #define BLAKE2S_LOAD_MSG_8_1(buf) \
574  do { uint32x2_t t0, t1; \
575  t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m3)).val[0]; \
576  t1 = vext_u32(vget_high_u32(m2), vget_low_u32(m0), 1); \
577  buf = vcombine_u32(t0, t1); } while(0)
578 
579  #define BLAKE2S_LOAD_MSG_8_2(buf) \
580  do { uint32x2_t t0, t1; \
581  t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
582  t1 = vext_u32(vget_high_u32(m0), vget_low_u32(m2), 1); \
583  buf = vcombine_u32(t0, t1); } while(0)
584 
585  #define BLAKE2S_LOAD_MSG_8_3(buf) \
586  do { uint32x2_t t0, t1; \
587  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m3)); \
588  t1 = vext_u32(vget_low_u32(m0), vget_high_u32(m2), 1); \
589  buf = vcombine_u32(t0, t1); } while(0)
590 
591  #define BLAKE2S_LOAD_MSG_8_4(buf) \
592  do { uint32x2_t t0, t1; \
593  t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m1)); \
594  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_low_u32(m1)); \
595  buf = vcombine_u32(t0, t1); } while(0)
596 
597  #define BLAKE2S_LOAD_MSG_9_1(buf) \
598  do { uint32x2_t t0, t1; \
599  t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
600  t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m0)).val[1]; \
601  buf = vcombine_u32(t0, t1); } while(0)
602 
603  #define BLAKE2S_LOAD_MSG_9_2(buf) \
604  do { uint32x2_t t0, t1; \
605  t0 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m1)).val[0]; \
606  t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_low_u32(m1)); \
607  buf = vcombine_u32(t0, t1); } while(0)
608 
609  #define BLAKE2S_LOAD_MSG_9_3(buf) \
610  do { uint32x2_t t0, t1; \
611  t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
612  t1 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m3)).val[1]; \
613  buf = vcombine_u32(t0, t1); } while(0)
614 
615  #define BLAKE2S_LOAD_MSG_9_4(buf) \
616  do { uint32x2_t t0, t1; \
617  t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
618  t1 = vzip_u32(vget_low_u32(m3), vget_low_u32(m0)).val[0]; \
619  buf = vcombine_u32(t0, t1); } while(0)
620 
621  #define vrorq_n_u32_16(x) vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)))
622 
623  #define vrorq_n_u32_8(x) vsriq_n_u32(vshlq_n_u32((x), 24), (x), 8)
624 
625  #define vrorq_n_u32(x, c) vsriq_n_u32(vshlq_n_u32((x), 32-(c)), (x), (c))
626 
627  #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
628  do { \
629  row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
630  row4 = vrorq_n_u32_16(row4); row3 = vaddq_u32(row3, row4); \
631  row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 12); \
632  } while(0)
633 
634  #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
635  do { \
636  row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
637  row4 = vrorq_n_u32_8(row4); row3 = vaddq_u32(row3, row4); \
638  row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 7); \
639  } while(0)
640 
641  #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
642  do { \
643  row4 = vextq_u32(row4, row4, 3); row3 = vextq_u32(row3, row3, 2); row2 = vextq_u32(row2, row2, 1); \
644  } while(0)
645 
646  #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
647  do { \
648  row4 = vextq_u32(row4, row4, 1); \
649  row3 = vextq_u32(row3, row3, 2); \
650  row2 = vextq_u32(row2, row2, 3); \
651  } while(0)
652 
653  #define BLAKE2S_ROUND(r) \
654  do { \
655  uint32x4_t buf1, buf2, buf3, buf4; \
656  BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
657  BLAKE2S_G1(row1,row2,row3,row4,buf1); \
658  BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \
659  BLAKE2S_G2(row1,row2,row3,row4,buf2); \
660  BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \
661  BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \
662  BLAKE2S_G1(row1,row2,row3,row4,buf3); \
663  BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \
664  BLAKE2S_G2(row1,row2,row3,row4,buf4); \
665  BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \
666  } while(0)
667 
668  const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8(input + 00));
669  const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8(input + 16));
670  const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8(input + 32));
671  const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8(input + 48));
672 
673  uint32x4_t row1, row2, row3, row4;
674 
675  const uint32x4_t f0 = row1 = vld1q_u32(state.h()+0);
676  const uint32x4_t f1 = row2 = vld1q_u32(state.h()+4);
677  row3 = vld1q_u32(BLAKE2S_IV+0);
678  row4 = veorq_u32(vld1q_u32(BLAKE2S_IV+4), vld1q_u32(state.t()+0));
679 
680  BLAKE2S_ROUND(0);
681  BLAKE2S_ROUND(1);
682  BLAKE2S_ROUND(2);
683  BLAKE2S_ROUND(3);
684  BLAKE2S_ROUND(4);
685  BLAKE2S_ROUND(5);
686  BLAKE2S_ROUND(6);
687  BLAKE2S_ROUND(7);
688  BLAKE2S_ROUND(8);
689  BLAKE2S_ROUND(9);
690 
691  vst1q_u32(state.h()+0, veorq_u32(f0, veorq_u32(row1, row3)));
692  vst1q_u32(state.h()+4, veorq_u32(f1, veorq_u32(row2, row4)));
693 }
694 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
695 
696 #if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
697 
698 inline uint32x4_p VecLoad32(const void* p)
699 {
700  return VecLoad((const word32*)p);
701 }
702 
703 inline uint32x4_p VecLoad32LE(const void* p)
704 {
705 #if __BIG_ENDIAN__
706  const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
707  const uint32x4_p v = VecLoad((const word32*)p);
708  return VecPermute(v, v, m);
709 #else
710  return VecLoad((const word32*)p);
711 #endif
712 }
713 
714 inline void VecStore32(void* p, const uint32x4_p x)
715 {
716  VecStore(x, (word32*)p);
717 }
718 
719 inline void VecStore32LE(void* p, const uint32x4_p x)
720 {
721 #if __BIG_ENDIAN__
722  const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
723  VecStore(VecPermute(x, x, m), (word32*)p);
724 #else
725  VecStore(x, (word32*)p);
726 #endif
727 }
728 
729 template <unsigned int E1, unsigned int E2>
730 inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b)
731 {
732  // Re-index. I'd like to use something like Z=Y*4 and then
733  // VecShiftLeftOctet<Z>(b) but it crashes early Red Hat
734  // GCC compilers.
735  enum {X=E1&3, Y=E2&3};
736 
737  // Don't care element
738  const unsigned int DC = 31;
739 
740  // Element 0 combinations
741  if (X == 0 && Y == 0)
742  {
743  const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
744  return VecPermute(a, b, mask);
745  }
746  else if (X == 0 && Y == 1)
747  {
748  const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
749  return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
750  }
751  else if (X == 0 && Y == 2)
752  {
753  const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
754  return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
755  }
756  else if (X == 0 && Y == 3)
757  {
758  const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
759  return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
760  }
761 
762  // Element 1 combinations
763  else if (X == 1 && Y == 0)
764  {
765  const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
766  return VecPermute(a, b, mask);
767  }
768  else if (X == 1 && Y == 1)
769  {
770  const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
771  return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
772  }
773  else if (X == 1 && Y == 2)
774  {
775  const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
776  return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
777  }
778  else if (X == 1 && Y == 3)
779  {
780  const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
781  return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
782  }
783 
784  // Element 2 combinations
785  else if (X == 2 && Y == 0)
786  {
787  const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
788  return VecPermute(a, b, mask);
789  }
790  else if (X == 2 && Y == 1)
791  {
792  const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
793  return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
794  }
795  else if (X == 2 && Y == 2)
796  {
797  const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
798  return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
799  }
800  else if (X == 2 && Y == 3)
801  {
802  const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
803  return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
804  }
805 
806  // Element 3 combinations
807  else if (X == 3 && Y == 0)
808  {
809  const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
810  return VecPermute(a, b, mask);
811  }
812  else if (X == 3 && Y == 1)
813  {
814  const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
815  return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
816  }
817  else if (X == 3 && Y == 2)
818  {
819  const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
820  return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
821  }
822  else if (X == 3 && Y == 3)
823  {
824  const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
825  return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
826  }
827 
828  // Quiet IBM XLC warning
829  return VecXor(a, a);
830 }
831 
832 template <unsigned int E1, unsigned int E2, unsigned int E3, unsigned int E4>
833 inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b,
834  const uint32x4_p c, const uint32x4_p d)
835 {
836  // Re-index
837  enum {W=E1&3, X=E2&3, Y=E3&3, Z=E4&3};
838 
839  const uint32x4_p t0 = VectorSet32<W,X>(a, b);
840  const uint32x4_p t1 = VectorSet32<Y,Z>(c, d);
841 
842  // Power7 follows SSE2's implementation, and this is _mm_set_epi32.
843  const uint8x16_p mask = {20,21,22,23, 16,17,18,19, 4,5,6,7, 0,1,2,3};
844  return VecPermute(t0, t1, mask);
845 }
846 
847 template<>
848 uint32x4_p VectorSet32<2,0,2,0>(const uint32x4_p a, const uint32x4_p b,
849  const uint32x4_p c, const uint32x4_p d)
850 {
851  // a=b, c=d, mask is {2,0, 2,0}
852  const uint8x16_p mask = {16,17,18,19, 24,25,26,27, 0,1,2,3, 8,9,10,11};
853  return VecPermute(a, c, mask);
854 }
855 
856 template<>
857 uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b,
858  const uint32x4_p c, const uint32x4_p d)
859 {
860  // a=b, c=d, mask is {3,1, 3,1}
861  const uint8x16_p mask = {20,21,22,23, 28,29,30,31, 4,5,6,7, 12,13,14,15};
862  return VecPermute(a, c, mask);
863 }
864 
// BLAKE2_Compress32_CORE will use either POWER7 or ALTIVEC,
// depending on the flags used to compile this source file. The
// abstractions are handled in VecLoad, VecStore and friends. In
// the future we may provide both POWER7 and ALTIVEC at the same
// time to better support distros.
void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
{
    // The 64-byte message block lives in four vectors (m0, m4, m8,
    // m12) of four 32-bit words each. The aliases below let the
    // LOAD_MSG macros name individual message words m0..m15; the
    // general VectorSet32 template reduces its selector indices
    // mod 4, so each alias resolves to the vector that actually
    // holds that word.
    # define m1 m0
    # define m2 m0
    # define m3 m0

    # define m5 m4
    # define m6 m4
    # define m7 m4

    # define m9 m8
    # define m10 m8
    # define m11 m8

    # define m13 m12
    # define m14 m12
    # define m15 m12

    // Message schedule for the 10 BLAKE2s rounds (the sigma
    // permutation). The commented-out forms show the original,
    // unreduced word indices; the active forms use the mod-4
    // reduced indices so round 0 dispatches to the single-permute
    // <2,0,2,0> and <3,1,3,1> specializations above.
    // #define BLAKE2S_LOAD_MSG_0_1(buf) buf = VectorSet32<6,4,2,0>(m6,m4,m2,m0);
    #define BLAKE2S_LOAD_MSG_0_1(buf) buf = VectorSet32<2,0,2,0>(m6,m4,m2,m0);
    // #define BLAKE2S_LOAD_MSG_0_2(buf) buf = VectorSet32<7,5,3,1>(m7,m5,m3,m1);
    #define BLAKE2S_LOAD_MSG_0_2(buf) buf = VectorSet32<3,1,3,1>(m7,m5,m3,m1);
    // #define BLAKE2S_LOAD_MSG_0_3(buf) buf = VectorSet32<14,12,10,8>(m14,m12,m10,m8);
    #define BLAKE2S_LOAD_MSG_0_3(buf) buf = VectorSet32<2,0,2,0>(m14,m12,m10,m8);
    // #define BLAKE2S_LOAD_MSG_0_4(buf) buf = VectorSet32<15,13,11,9>(m15,m13,m11,m9);
    #define BLAKE2S_LOAD_MSG_0_4(buf) buf = VectorSet32<3,1,3,1>(m15,m13,m11,m9);

    #define BLAKE2S_LOAD_MSG_1_1(buf) buf = VectorSet32<13,9,4,14>(m13,m9,m4,m14);
    #define BLAKE2S_LOAD_MSG_1_2(buf) buf = VectorSet32<6,15,8,10>(m6,m15,m8,m10)
    #define BLAKE2S_LOAD_MSG_1_3(buf) buf = VectorSet32<5,11,0,1>(m5,m11,m0,m1)
    #define BLAKE2S_LOAD_MSG_1_4(buf) buf = VectorSet32<3,7,2,12>(m3,m7,m2,m12)

    #define BLAKE2S_LOAD_MSG_2_1(buf) buf = VectorSet32<15,5,12,11>(m15,m5,m12,m11)
    #define BLAKE2S_LOAD_MSG_2_2(buf) buf = VectorSet32<13,2,0,8>(m13,m2,m0,m8)
    #define BLAKE2S_LOAD_MSG_2_3(buf) buf = VectorSet32<9,7,3,10>(m9,m7,m3,m10)
    #define BLAKE2S_LOAD_MSG_2_4(buf) buf = VectorSet32<4,1,6,14>(m4,m1,m6,m14)

    #define BLAKE2S_LOAD_MSG_3_1(buf) buf = VectorSet32<11,13,3,7>(m11,m13,m3,m7)
    #define BLAKE2S_LOAD_MSG_3_2(buf) buf = VectorSet32<14,12,1,9>(m14,m12,m1,m9)
    #define BLAKE2S_LOAD_MSG_3_3(buf) buf = VectorSet32<15,4,5,2>(m15,m4,m5,m2)
    #define BLAKE2S_LOAD_MSG_3_4(buf) buf = VectorSet32<8,0,10,6>(m8,m0,m10,m6)

    #define BLAKE2S_LOAD_MSG_4_1(buf) buf = VectorSet32<10,2,5,9>(m10,m2,m5,m9)
    #define BLAKE2S_LOAD_MSG_4_2(buf) buf = VectorSet32<15,4,7,0>(m15,m4,m7,m0)
    #define BLAKE2S_LOAD_MSG_4_3(buf) buf = VectorSet32<3,6,11,14>(m3,m6,m11,m14)
    #define BLAKE2S_LOAD_MSG_4_4(buf) buf = VectorSet32<13,8,12,1>(m13,m8,m12,m1)

    #define BLAKE2S_LOAD_MSG_5_1(buf) buf = VectorSet32<8,0,6,2>(m8,m0,m6,m2)
    #define BLAKE2S_LOAD_MSG_5_2(buf) buf = VectorSet32<3,11,10,12>(m3,m11,m10,m12)
    #define BLAKE2S_LOAD_MSG_5_3(buf) buf = VectorSet32<1,15,7,4>(m1,m15,m7,m4)
    #define BLAKE2S_LOAD_MSG_5_4(buf) buf = VectorSet32<9,14,5,13>(m9,m14,m5,m13)

    #define BLAKE2S_LOAD_MSG_6_1(buf) buf = VectorSet32<4,14,1,12>(m4,m14,m1,m12)
    #define BLAKE2S_LOAD_MSG_6_2(buf) buf = VectorSet32<10,13,15,5>(m10,m13,m15,m5)
    #define BLAKE2S_LOAD_MSG_6_3(buf) buf = VectorSet32<8,9,6,0>(m8,m9,m6,m0)
    #define BLAKE2S_LOAD_MSG_6_4(buf) buf = VectorSet32<11,2,3,7>(m11,m2,m3,m7)

    #define BLAKE2S_LOAD_MSG_7_1(buf) buf = VectorSet32<3,12,7,13>(m3,m12,m7,m13)
    #define BLAKE2S_LOAD_MSG_7_2(buf) buf = VectorSet32<9,1,14,11>(m9,m1,m14,m11)
    #define BLAKE2S_LOAD_MSG_7_3(buf) buf = VectorSet32<2,8,15,5>(m2,m8,m15,m5)
    #define BLAKE2S_LOAD_MSG_7_4(buf) buf = VectorSet32<10,6,4,0>(m10,m6,m4,m0)

    #define BLAKE2S_LOAD_MSG_8_1(buf) buf = VectorSet32<0,11,14,6>(m0,m11,m14,m6)
    #define BLAKE2S_LOAD_MSG_8_2(buf) buf = VectorSet32<8,3,9,15>(m8,m3,m9,m15)
    #define BLAKE2S_LOAD_MSG_8_3(buf) buf = VectorSet32<10,1,13,12>(m10,m1,m13,m12)
    #define BLAKE2S_LOAD_MSG_8_4(buf) buf = VectorSet32<5,4,7,2>(m5,m4,m7,m2)

    #define BLAKE2S_LOAD_MSG_9_1(buf) buf = VectorSet32<1,7,8,10>(m1,m7,m8,m10)
    #define BLAKE2S_LOAD_MSG_9_2(buf) buf = VectorSet32<5,6,4,2>(m5,m6,m4,m2)
    #define BLAKE2S_LOAD_MSG_9_3(buf) buf = VectorSet32<13,3,9,15>(m13,m3,m9,m15)
    #define BLAKE2S_LOAD_MSG_9_4(buf) buf = VectorSet32<0,12,14,11>(m0,m12,m14,m11)

    // The four right-rotation amounts used by the BLAKE2s G function
    #define vec_ror_16(x) VecRotateRight<16>(x)
    #define vec_ror_12(x) VecRotateRight<12>(x)
    #define vec_ror_8(x) VecRotateRight<8>(x)
    #define vec_ror_7(x) VecRotateRight<7>(x)

    // First half of the G function (rotations by 16 and 12)
    #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
      row1 = VecAdd(VecAdd(row1, buf), row2); \
      row4 = VecXor(row4, row1); \
      row4 = vec_ror_16(row4); \
      row3 = VecAdd(row3, row4); \
      row2 = VecXor(row2, row3); \
      row2 = vec_ror_12(row2);

    // Second half of the G function (rotations by 8 and 7)
    #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
      row1 = VecAdd(VecAdd(row1, buf), row2); \
      row4 = VecXor(row4, row1); \
      row4 = vec_ror_8(row4); \
      row3 = VecAdd(row3, row4); \
      row2 = VecXor(row2, row3); \
      row2 = vec_ror_7(row2);

    // Byte permutation masks that rotate a row of the 4x4 state left
    // by one, two or three 32-bit elements
    const uint8x16_p D2103_MASK = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
    const uint8x16_p D1032_MASK = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
    const uint8x16_p D0321_MASK = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};

    // Rotate rows 2..4 so the next G application works on diagonals
    #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
      row4 = VecPermute(row4, row4, D2103_MASK); \
      row3 = VecPermute(row3, row3, D1032_MASK); \
      row2 = VecPermute(row2, row2, D0321_MASK);

    // Undo BLAKE2S_DIAGONALIZE (the inverse rotations)
    #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
      row4 = VecPermute(row4, row4, D0321_MASK); \
      row3 = VecPermute(row3, row3, D1032_MASK); \
      row2 = VecPermute(row2, row2, D2103_MASK);

    // One full round: column step (G1+G2), diagonalize, diagonal
    // step (G1+G2), undiagonalize
    #define BLAKE2S_ROUND(r) \
      BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
      BLAKE2S_G1(row1,row2,row3,row4,buf1); \
      BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \
      BLAKE2S_G2(row1,row2,row3,row4,buf2); \
      BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \
      BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \
      BLAKE2S_G1(row1,row2,row3,row4,buf3); \
      BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \
      BLAKE2S_G2(row1,row2,row3,row4,buf4); \
      BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4);

    uint32x4_p row1, row2, row3, row4;  // the 4x4 working state, one row per vector
    uint32x4_p buf1, buf2, buf3, buf4;  // permuted message words for one round
    uint32x4_p ff0, ff1;                // copy of the incoming chaining value

    // Load the 64-byte message block as little-endian 32-bit words
    const uint32x4_p m0 = VecLoad32LE(input + 0);
    const uint32x4_p m4 = VecLoad32LE(input + 16);
    const uint32x4_p m8 = VecLoad32LE(input + 32);
    const uint32x4_p m12 = VecLoad32LE(input + 48);

    // Initialize the state: rows 1-2 from the chaining value h, row 3
    // from the IV, row 4 from the IV XORed with the four words of
    // state.t() (presumably t0,t1,f0,f1 -- confirm against blake2.cpp)
    row1 = ff0 = VecLoad32LE(state.h()+0);
    row2 = ff1 = VecLoad32LE(state.h()+4);
    row3 = VecLoad32(BLAKE2S_IV+0);
    row4 = VecXor(VecLoad32(BLAKE2S_IV+4), VecLoad32(state.t()+0));

    // BLAKE2s performs exactly 10 rounds
    BLAKE2S_ROUND(0);
    BLAKE2S_ROUND(1);
    BLAKE2S_ROUND(2);
    BLAKE2S_ROUND(3);
    BLAKE2S_ROUND(4);
    BLAKE2S_ROUND(5);
    BLAKE2S_ROUND(6);
    BLAKE2S_ROUND(7);
    BLAKE2S_ROUND(8);
    BLAKE2S_ROUND(9);

    // Finalize: fold the two state halves back into the chaining value
    VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)));
    VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)));
}
1017 #endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
1018 
1019 #if (CRYPTOPP_POWER7_AVAILABLE)
1020 
// POWER7 entry point; a thin dispatch wrapper over the shared
// Altivec/POWER7 compression core above.
void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
{
    BLAKE2_Compress32_CORE(input, state);
}
1025 
1026 #elif (CRYPTOPP_ALTIVEC_AVAILABLE)
1027 
// Altivec entry point; a thin dispatch wrapper over the shared
// Altivec/POWER7 compression core above.
void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
{
    BLAKE2_Compress32_CORE(input, state);
}
1032 
1033 #endif
1034 
1035 NAMESPACE_END
Utility functions for the Crypto++ library.
Library configuration file.
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:875
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:128
Support functions for PowerPC and vector operations.
Precompiled header file.
Classes for BLAKE2b and BLAKE2s message digests and keyed message digests.
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:600
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:945
BLAKE2s state information.
Definition: blake2.h:163
Crypto++ library namespace.
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:251
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:118