Crypto++ 8.0
Free C++ class library of cryptographic schemes
speck128_simd.cpp
1 // speck128_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
6 // the appropriate instruction sets in some build configurations.
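// For example, a GNU-style build typically compiles this translation unit
// with options such as -mssse3 (x86/x64), -march=armv8-a (AArch64) or
// -mcpu=power8 -maltivec (PowerPC); the exact flags depend on the compiler
// and the build files in use.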
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "speck.h"
12 #include "misc.h"
13 #include "adv_simd.h"
14 
15 // Uncomment for benchmarking C++ against SSE or NEON.
16 // Do so in both speck.cpp and speck128_simd.cpp.
17 // #undef CRYPTOPP_SSSE3_AVAILABLE
18 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
19 
20 #if (CRYPTOPP_SSSE3_AVAILABLE)
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if defined(__XOP__)
26 # include <ammintrin.h>
27 #endif
28 
29 #if defined(__AVX512F__) && defined(__AVX512VL__)
30 # define CRYPTOPP_AVX512_ROTATE 1
31 # include <immintrin.h>
32 #endif
33 
34 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
35 # include <arm_neon.h>
36 #endif
37 
38 // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
39 // compilers don't follow ACLE conventions for the include.
40 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
41 # include <stdint.h>
42 # include <arm_acle.h>
43 #endif
44 
45 #if defined(CRYPTOPP_POWER8_AVAILABLE)
46 # include "ppc_simd.h"
47 #endif
48 
49 // Squash MS LNK4221 and libtool warnings
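// (LNK4221 fires when an object file defines no public symbols; the file-name
// array below guarantees this translation unit always exports at least one.)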
50 extern const char SPECK128_SIMD_FNAME[] = __FILE__;
51 
52 ANONYMOUS_NAMESPACE_BEGIN
53 
54 using CryptoPP::byte;
55 using CryptoPP::word32;
56 using CryptoPP::word64;
57 
58 // *************************** ARM NEON ************************** //
59 
60 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
61 
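// UnpackHigh64/UnpackLow64 gather the high (or low) 64-bit halves of two
// vectors into a single vector, mirroring the _mm_unpackhi_epi64 and
// _mm_unpacklo_epi64 operations used in the SSSE3 code path below.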
62 template <class T>
63 inline T UnpackHigh64(const T& a, const T& b)
64 {
65  const uint64x1_t x(vget_high_u64((uint64x2_t)a));
66  const uint64x1_t y(vget_high_u64((uint64x2_t)b));
67  return (T)vcombine_u64(x, y);
68 }
69 
70 template <class T>
71 inline T UnpackLow64(const T& a, const T& b)
72 {
73  const uint64x1_t x(vget_low_u64((uint64x2_t)a));
74  const uint64x1_t y(vget_low_u64((uint64x2_t)b));
75  return (T)vcombine_u64(x, y);
76 }
77 
78 template <unsigned int R>
79 inline uint64x2_t RotateLeft64(const uint64x2_t& val)
80 {
81  const uint64x2_t a(vshlq_n_u64(val, R));
82  const uint64x2_t b(vshrq_n_u64(val, 64 - R));
83  return vorrq_u64(a, b);
84 }
85 
86 template <unsigned int R>
87 inline uint64x2_t RotateRight64(const uint64x2_t& val)
88 {
89  const uint64x2_t a(vshlq_n_u64(val, 64 - R));
90  const uint64x2_t b(vshrq_n_u64(val, R));
91  return vorrq_u64(a, b);
92 }
93 
94 #if defined(__aarch32__) || defined(__aarch64__)
95 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
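// A rotation by 8 bits moves whole bytes, so each 64-bit lane can be rotated
// with one table-lookup byte permutation (vqtbl1q_u8) instead of the
// shift/shift/OR sequence of the generic template.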
96 template <>
97 inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
98 {
99 #if (CRYPTOPP_BIG_ENDIAN)
100  const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
101  const uint8x16_t mask = vld1q_u8(maskb);
102 #else
103  const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
104  const uint8x16_t mask = vld1q_u8(maskb);
105 #endif
106 
107  return vreinterpretq_u64_u8(
108  vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
109 }
110 
111 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
112 template <>
113 inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
114 {
115 #if (CRYPTOPP_BIG_ENDIAN)
116  const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
117  const uint8x16_t mask = vld1q_u8(maskb);
118 #else
119  const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
120  const uint8x16_t mask = vld1q_u8(maskb);
121 #endif
122 
123  return vreinterpretq_u64_u8(
124  vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
125 }
126 #endif
127 
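// SPECK-128 encryption round, with round key k:
//   x' = (ROTR64(x, 8) + y) ^ k
//   y' =  ROTL64(y, 3) ^ x'
// Each uint64x2_t below holds the same half (x or y) of two adjacent blocks,
// so every pass through the loop advances two blocks by one round.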
128 inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
129  const word64 *subkeys, unsigned int rounds)
130 {
131  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
132  uint64x2_t x1 = UnpackHigh64(block0, block1);
133  uint64x2_t y1 = UnpackLow64(block0, block1);
134 
135  for (int i=0; i < static_cast<int>(rounds); ++i)
136  {
137  const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
138 
139  x1 = RotateRight64<8>(x1);
140  x1 = vaddq_u64(x1, y1);
141  x1 = veorq_u64(x1, rk);
142  y1 = RotateLeft64<3>(y1);
143  y1 = veorq_u64(y1, x1);
144  }
145 
146  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
147  block0 = UnpackLow64(y1, x1);
148  block1 = UnpackHigh64(y1, x1);
149 }
150 
151 inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
152  uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
153  const word64 *subkeys, unsigned int rounds)
154 {
155  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
156  uint64x2_t x1 = UnpackHigh64(block0, block1);
157  uint64x2_t y1 = UnpackLow64(block0, block1);
158  uint64x2_t x2 = UnpackHigh64(block2, block3);
159  uint64x2_t y2 = UnpackLow64(block2, block3);
160  uint64x2_t x3 = UnpackHigh64(block4, block5);
161  uint64x2_t y3 = UnpackLow64(block4, block5);
162 
163  for (int i=0; i < static_cast<int>(rounds); ++i)
164  {
165  const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
166 
167  x1 = RotateRight64<8>(x1);
168  x2 = RotateRight64<8>(x2);
169  x3 = RotateRight64<8>(x3);
170  x1 = vaddq_u64(x1, y1);
171  x2 = vaddq_u64(x2, y2);
172  x3 = vaddq_u64(x3, y3);
173  x1 = veorq_u64(x1, rk);
174  x2 = veorq_u64(x2, rk);
175  x3 = veorq_u64(x3, rk);
176  y1 = RotateLeft64<3>(y1);
177  y2 = RotateLeft64<3>(y2);
178  y3 = RotateLeft64<3>(y3);
179  y1 = veorq_u64(y1, x1);
180  y2 = veorq_u64(y2, x2);
181  y3 = veorq_u64(y3, x3);
182  }
183 
184  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
185  block0 = UnpackLow64(y1, x1);
186  block1 = UnpackHigh64(y1, x1);
187  block2 = UnpackLow64(y2, x2);
188  block3 = UnpackHigh64(y2, x2);
189  block4 = UnpackLow64(y3, x3);
190  block5 = UnpackHigh64(y3, x3);
191 }
192 
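// SPECK-128 decryption round (inverse of the encryption round above):
//   y' = ROTR64(y ^ x, 3)
//   x' = ROTL64((x ^ k) - y', 8)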
193 inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
194  const word64 *subkeys, unsigned int rounds)
195 {
196  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
197  uint64x2_t x1 = UnpackHigh64(block0, block1);
198  uint64x2_t y1 = UnpackLow64(block0, block1);
199 
200  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
201  {
202  const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
203 
204  y1 = veorq_u64(y1, x1);
205  y1 = RotateRight64<3>(y1);
206  x1 = veorq_u64(x1, rk);
207  x1 = vsubq_u64(x1, y1);
208  x1 = RotateLeft64<8>(x1);
209  }
210 
211  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
212  block0 = UnpackLow64(y1, x1);
213  block1 = UnpackHigh64(y1, x1);
214 }
215 
216 inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
217  uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
218  const word64 *subkeys, unsigned int rounds)
219 {
220  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
221  uint64x2_t x1 = UnpackHigh64(block0, block1);
222  uint64x2_t y1 = UnpackLow64(block0, block1);
223  uint64x2_t x2 = UnpackHigh64(block2, block3);
224  uint64x2_t y2 = UnpackLow64(block2, block3);
225  uint64x2_t x3 = UnpackHigh64(block4, block5);
226  uint64x2_t y3 = UnpackLow64(block4, block5);
227 
228  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
229  {
230  const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
231 
232  y1 = veorq_u64(y1, x1);
233  y2 = veorq_u64(y2, x2);
234  y3 = veorq_u64(y3, x3);
235  y1 = RotateRight64<3>(y1);
236  y2 = RotateRight64<3>(y2);
237  y3 = RotateRight64<3>(y3);
238  x1 = veorq_u64(x1, rk);
239  x2 = veorq_u64(x2, rk);
240  x3 = veorq_u64(x3, rk);
241  x1 = vsubq_u64(x1, y1);
242  x2 = vsubq_u64(x2, y2);
243  x3 = vsubq_u64(x3, y3);
244  x1 = RotateLeft64<8>(x1);
245  x2 = RotateLeft64<8>(x2);
246  x3 = RotateLeft64<8>(x3);
247  }
248 
249  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
250  block0 = UnpackLow64(y1, x1);
251  block1 = UnpackHigh64(y1, x1);
252  block2 = UnpackLow64(y2, x2);
253  block3 = UnpackHigh64(y2, x2);
254  block4 = UnpackLow64(y3, x3);
255  block5 = UnpackHigh64(y3, x3);
256 }
257 
258 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
259 
260 // ***************************** IA-32 ***************************** //
261 
262 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
263 
264 // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
265 #ifndef M128_CAST
266 # define M128_CAST(x) ((__m128i *)(void *)(x))
267 #endif
268 #ifndef CONST_M128_CAST
269 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
270 #endif
271 
272 // GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
273 #ifndef DOUBLE_CAST
274 # define DOUBLE_CAST(x) ((double *)(void *)(x))
275 #endif
276 #ifndef CONST_DOUBLE_CAST
277 # define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
278 #endif
279 
280 template <unsigned int R>
281 inline __m128i RotateLeft64(const __m128i& val)
282 {
283 #if defined(CRYPTOPP_AVX512_ROTATE)
284  return _mm_rol_epi64(val, R);
285 #elif defined(__XOP__)
286  return _mm_roti_epi64(val, R);
287 #else
288  return _mm_or_si128(
289  _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
290 #endif
291 }
292 
293 template <unsigned int R>
294 inline __m128i RotateRight64(const __m128i& val)
295 {
296 #if defined(CRYPTOPP_AVX512_ROTATE)
297  return _mm_ror_epi64(val, R);
298 #elif defined(__XOP__)
299  return _mm_roti_epi64(val, 64-R);
300 #else
301  return _mm_or_si128(
302  _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
303 #endif
304 }
305 
306 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
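// As in the NEON path, an 8-bit rotation is a fixed byte permutation within
// each 64-bit lane, so a single _mm_shuffle_epi8 (SSSE3) does the job.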
307 template <>
308 __m128i RotateLeft64<8>(const __m128i& val)
309 {
310 #if defined(__XOP__)
311  return _mm_roti_epi64(val, 8);
312 #else
313  const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
314  return _mm_shuffle_epi8(val, mask);
315 #endif
316 }
317 
318 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
319 template <>
320 __m128i RotateRight64<8>(const __m128i& val)
321 {
322 #if defined(__XOP__)
323  return _mm_roti_epi64(val, 64-8);
324 #else
325  const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
326  return _mm_shuffle_epi8(val, mask);
327 #endif
328 }
329 
330 inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
331  const word64 *subkeys, unsigned int rounds)
332 {
333  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
334  __m128i x1 = _mm_unpackhi_epi64(block0, block1);
335  __m128i y1 = _mm_unpacklo_epi64(block0, block1);
336 
337  for (int i=0; i < static_cast<int>(rounds); ++i)
338  {
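 // Broadcast the 64-bit round key into both lanes. _mm_loaddup_pd only moves
 // the bit pattern; no floating-point arithmetic is performed on the key.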
339  const __m128i rk = _mm_castpd_si128(
340  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
341 
342  x1 = RotateRight64<8>(x1);
343  x1 = _mm_add_epi64(x1, y1);
344  x1 = _mm_xor_si128(x1, rk);
345  y1 = RotateLeft64<3>(y1);
346  y1 = _mm_xor_si128(y1, x1);
347  }
348 
349  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
350  block0 = _mm_unpacklo_epi64(y1, x1);
351  block1 = _mm_unpackhi_epi64(y1, x1);
352 }
353 
354 inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
355  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
356  const word64 *subkeys, unsigned int rounds)
357 {
358  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
359  __m128i x1 = _mm_unpackhi_epi64(block0, block1);
360  __m128i y1 = _mm_unpacklo_epi64(block0, block1);
361  __m128i x2 = _mm_unpackhi_epi64(block2, block3);
362  __m128i y2 = _mm_unpacklo_epi64(block2, block3);
363  __m128i x3 = _mm_unpackhi_epi64(block4, block5);
364  __m128i y3 = _mm_unpacklo_epi64(block4, block5);
365 
366  for (int i=0; i < static_cast<int>(rounds); ++i)
367  {
368  const __m128i rk = _mm_castpd_si128(
369  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
370 
371  x1 = RotateRight64<8>(x1);
372  x2 = RotateRight64<8>(x2);
373  x3 = RotateRight64<8>(x3);
374  x1 = _mm_add_epi64(x1, y1);
375  x2 = _mm_add_epi64(x2, y2);
376  x3 = _mm_add_epi64(x3, y3);
377  x1 = _mm_xor_si128(x1, rk);
378  x2 = _mm_xor_si128(x2, rk);
379  x3 = _mm_xor_si128(x3, rk);
380  y1 = RotateLeft64<3>(y1);
381  y2 = RotateLeft64<3>(y2);
382  y3 = RotateLeft64<3>(y3);
383  y1 = _mm_xor_si128(y1, x1);
384  y2 = _mm_xor_si128(y2, x2);
385  y3 = _mm_xor_si128(y3, x3);
386  }
387 
388  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
389  block0 = _mm_unpacklo_epi64(y1, x1);
390  block1 = _mm_unpackhi_epi64(y1, x1);
391  block2 = _mm_unpacklo_epi64(y2, x2);
392  block3 = _mm_unpackhi_epi64(y2, x2);
393  block4 = _mm_unpacklo_epi64(y3, x3);
394  block5 = _mm_unpackhi_epi64(y3, x3);
395 }
396 
397 inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
398  const word64 *subkeys, unsigned int rounds)
399 {
400  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
401  __m128i x1 = _mm_unpackhi_epi64(block0, block1);
402  __m128i y1 = _mm_unpacklo_epi64(block0, block1);
403 
404  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
405  {
406  const __m128i rk = _mm_castpd_si128(
407  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
408 
409  y1 = _mm_xor_si128(y1, x1);
410  y1 = RotateRight64<3>(y1);
411  x1 = _mm_xor_si128(x1, rk);
412  x1 = _mm_sub_epi64(x1, y1);
413  x1 = RotateLeft64<8>(x1);
414  }
415 
416  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
417  block0 = _mm_unpacklo_epi64(y1, x1);
418  block1 = _mm_unpackhi_epi64(y1, x1);
419 }
420 
421 inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
422  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
423  const word64 *subkeys, unsigned int rounds)
424 {
425  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
426  __m128i x1 = _mm_unpackhi_epi64(block0, block1);
427  __m128i y1 = _mm_unpacklo_epi64(block0, block1);
428  __m128i x2 = _mm_unpackhi_epi64(block2, block3);
429  __m128i y2 = _mm_unpacklo_epi64(block2, block3);
430  __m128i x3 = _mm_unpackhi_epi64(block4, block5);
431  __m128i y3 = _mm_unpacklo_epi64(block4, block5);
432 
433  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
434  {
435  const __m128i rk = _mm_castpd_si128(
436  _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
437 
438  y1 = _mm_xor_si128(y1, x1);
439  y2 = _mm_xor_si128(y2, x2);
440  y3 = _mm_xor_si128(y3, x3);
441  y1 = RotateRight64<3>(y1);
442  y2 = RotateRight64<3>(y2);
443  y3 = RotateRight64<3>(y3);
444  x1 = _mm_xor_si128(x1, rk);
445  x2 = _mm_xor_si128(x2, rk);
446  x3 = _mm_xor_si128(x3, rk);
447  x1 = _mm_sub_epi64(x1, y1);
448  x2 = _mm_sub_epi64(x2, y2);
449  x3 = _mm_sub_epi64(x3, y3);
450  x1 = RotateLeft64<8>(x1);
451  x2 = RotateLeft64<8>(x2);
452  x3 = RotateLeft64<8>(x3);
453  }
454 
455  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
456  block0 = _mm_unpacklo_epi64(y1, x1);
457  block1 = _mm_unpackhi_epi64(y1, x1);
458  block2 = _mm_unpacklo_epi64(y2, x2);
459  block3 = _mm_unpackhi_epi64(y2, x2);
460  block4 = _mm_unpacklo_epi64(y3, x3);
461  block5 = _mm_unpackhi_epi64(y3, x3);
462 }
463 
464 #endif // CRYPTOPP_SSSE3_AVAILABLE
465 
466 // ***************************** Power8 ***************************** //
467 
468 #if defined(CRYPTOPP_POWER8_AVAILABLE)
469 
470 using CryptoPP::uint8x16_p;
471 using CryptoPP::uint32x4_p;
472 using CryptoPP::uint64x2_p;
473 
474 using CryptoPP::VecAdd;
475 using CryptoPP::VecSub;
476 using CryptoPP::VecXor;
477 using CryptoPP::VecPermute;
478 
479 // Rotate left by bit count
480 template<unsigned int C>
481 inline uint64x2_p RotateLeft64(const uint64x2_p val)
482 {
483  const uint64x2_p m = {C, C};
484  return vec_rl(val, m);
485 }
486 
487 // Rotate right by bit count
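// Altivec only provides a rotate-left (vec_rl), so a right rotation by C bits
// is carried out as a left rotation by 64-C bits.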
488 template<unsigned int C>
489 inline uint64x2_p RotateRight64(const uint64x2_p val)
490 {
491  const uint64x2_p m = {64-C, 64-C};
492  return vec_rl(val, m);
493 }
494 
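// The VecPermute calls below play the same role as the unpack steps in the
// NEON and SSSE3 paths: they rearrange the 64-bit block halves into the x/y
// working vectors (and back), using byte masks that account for the target's
// endianness.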
495 void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
496 {
497 #if (CRYPTOPP_BIG_ENDIAN)
498  const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
499  const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
500 #else
501  const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
502  const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
503 #endif
504 
505  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
506  uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
507  uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);
508 
509  for (int i=0; i < static_cast<int>(rounds); ++i)
510  {
511  const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);
512 
513  x1 = RotateRight64<8>(x1);
514  x1 = VecAdd(x1, y1);
515  x1 = VecXor(x1, rk);
516 
517  y1 = RotateLeft64<3>(y1);
518  y1 = VecXor(y1, x1);
519  }
520 
521 #if (CRYPTOPP_BIG_ENDIAN)
522  const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
523  //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
524 #else
525  const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
526  //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
527 #endif
528 
529  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
530  block = (uint32x4_p)VecPermute(x1, y1, m3);
531 }
532 
533 void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
534 {
535 #if (CRYPTOPP_BIG_ENDIAN)
536  const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
537  const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
538 #else
539  const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
540  const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
541 #endif
542 
543  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
544  uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
545  uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);
546 
547  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
548  {
549  const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);
550 
551  y1 = VecXor(y1, x1);
552  y1 = RotateRight64<3>(y1);
553  x1 = VecXor(x1, rk);
554  x1 = VecSub(x1, y1);
555  x1 = RotateLeft64<8>(x1);
556  }
557 
558 #if (CRYPTOPP_BIG_ENDIAN)
559  const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
560  //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
561 #else
562  const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
563  //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
564 #endif
565 
566  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
567  block = (uint32x4_p)VecPermute(x1, y1, m3);
568 }
569 
570 void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
571  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
572  uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
573 {
574 #if (CRYPTOPP_BIG_ENDIAN)
575  const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
576  const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
577 #else
578  const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
579  const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
580 #endif
581 
582  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
583  uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
584  uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
585  uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
586  uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
587  uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
588  uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);
589 
590  for (int i=0; i < static_cast<int>(rounds); ++i)
591  {
592  const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);
593 
594  x1 = RotateRight64<8>(x1);
595  x2 = RotateRight64<8>(x2);
596  x3 = RotateRight64<8>(x3);
597  x1 = VecAdd(x1, y1);
598  x2 = VecAdd(x2, y2);
599  x3 = VecAdd(x3, y3);
600  x1 = VecXor(x1, rk);
601  x2 = VecXor(x2, rk);
602  x3 = VecXor(x3, rk);
603 
604  y1 = RotateLeft64<3>(y1);
605  y2 = RotateLeft64<3>(y2);
606  y3 = RotateLeft64<3>(y3);
607  y1 = VecXor(y1, x1);
608  y2 = VecXor(y2, x2);
609  y3 = VecXor(y3, x3);
610  }
611 
612 #if (CRYPTOPP_BIG_ENDIAN)
613  const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
614  const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
615 #else
616  const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
617  const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
618 #endif
619 
620  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
621  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
622  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
623  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
624  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
625  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
626  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
627 }
628 
629 void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
630  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
631  uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
632 {
633 #if (CRYPTOPP_BIG_ENDIAN)
634  const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
635  const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
636 #else
637  const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
638  const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
639 #endif
640 
641  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
642  uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
643  uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
644  uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
645  uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
646  uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
647  uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);
648 
649  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
650  {
651  const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);
652 
653  y1 = VecXor(y1, x1);
654  y2 = VecXor(y2, x2);
655  y3 = VecXor(y3, x3);
656  y1 = RotateRight64<3>(y1);
657  y2 = RotateRight64<3>(y2);
658  y3 = RotateRight64<3>(y3);
659 
660  x1 = VecXor(x1, rk);
661  x2 = VecXor(x2, rk);
662  x3 = VecXor(x3, rk);
663  x1 = VecSub(x1, y1);
664  x2 = VecSub(x2, y2);
665  x3 = VecSub(x3, y3);
666  x1 = RotateLeft64<8>(x1);
667  x2 = RotateLeft64<8>(x2);
668  x3 = RotateLeft64<8>(x3);
669  }
670 
671 #if (CRYPTOPP_BIG_ENDIAN)
672  const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
673  const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
674 #else
675  const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
676  const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
677 #endif
678 
679  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
680  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
681  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
682  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
683  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
684  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
685  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
686 }
687 
688 #endif // CRYPTOPP_POWER8_AVAILABLE
689 
690 ANONYMOUS_NAMESPACE_END
691 
692 ///////////////////////////////////////////////////////////////////////
693 
694 NAMESPACE_BEGIN(CryptoPP)
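// The functions below are thin dispatch wrappers around adv_simd.h. They are
// called from speck.cpp once runtime feature detection (for example
// HasSSSE3(), HasNEON() or HasAltivec()) has selected this code path.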
695 
696 // *************************** ARM NEON **************************** //
697 
698 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
699 size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
700  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
701 {
702  return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
703  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
704 }
705 
706 size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
707  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
708 {
709  return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
710  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
711 }
712 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
713 
714 // ***************************** IA-32 ***************************** //
715 
716 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
717 size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
718  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
719 {
720  return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
721  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
722 }
723 
724 size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
725  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
726 {
727  return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
728  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
729 }
730 #endif // CRYPTOPP_SSSE3_AVAILABLE
731 
732 // ***************************** Power8 ***************************** //
733 
734 #if defined(CRYPTOPP_POWER8_AVAILABLE)
735 size_t SPECK128_Enc_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
736  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
737 {
738  return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
739  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
740 }
741 
742 size_t SPECK128_Dec_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
743  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
744 {
745  return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
746  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
747 }
748 #endif // CRYPTOPP_POWER8_AVAILABLE
749 
750 NAMESPACE_END