Crypto++ 8.0
Free C++ class library of cryptographic schemes
simon128_simd.cpp
// simon128_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "simon.h"
#include "misc.h"
#include "adv_simd.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both simon.cpp and simon128_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
// compilers don't follow ACLE conventions for the include.
#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_POWER7_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SIMON128_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::vec_swap;  // SunCC

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <class T>
inline T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}

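// For reference, the scalar rotations the two templates above vectorize,
// shown as a minimal sketch (not part of the library; RotL64 and RotR64
// are illustrative names):
//
//   template <unsigned int R>
//   inline word64 RotL64(word64 v) { return (v << R) | (v >> (64-R)); }
//   template <unsigned int R>
//   inline word64 RotR64(word64 v) { return (v >> R) | (v << (64-R)); }
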
#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif

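// The specializations above work because rotating a 64-bit lane left by 8
// bits moves each byte up one position, with the top byte wrapping to the
// bottom. On a little-endian lane {b0..b7} the result is
// {b7,b0,b1,b2,b3,b4,b5,b6}, which is exactly what the first eight mask
// indices 7,0,1,2, 3,4,5,6 select; the second eight indices do the same
// for the upper lane.
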
inline uint64x2_t SIMON128_f(const uint64x2_t& val)
{
    return veorq_u64(RotateLeft64<2>(val),
        vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val)));
}

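// SIMON128_f evaluates the Simon round function
// f(x) = ((x <<< 1) & (x <<< 8)) ^ (x <<< 2) on two blocks at once.
// A minimal scalar sketch of the two-round step the encryption loops
// below unroll (illustrative only; rotlFixed is the scalar rotate
// from misc.h):
//
//   y ^= (rotlFixed(x, 1) & rotlFixed(x, 8)) ^ rotlFixed(x, 2) ^ rk1;
//   x ^= (rotlFixed(y, 1) & rotlFixed(y, 8)) ^ rotlFixed(y, 2) ^ rk2;
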
inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}

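// Concretely, with input blocks block0 = {A1, A2} and block1 = {B1, B2}
// (two 64-bit lanes each, low lane first), UnpackHigh64 gathers the high
// halves into x1 = {A2, B2} and UnpackLow64 gathers the low halves into
// y1 = {A1, B1}, so each SIMD lane carries one independent block through
// the rounds; the final unpacks restore the original layout.
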
inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}

inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2));
        y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif

inline void Swap128(__m128i& a, __m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    // __m128i is an unsigned long long[2], and support for swapping it was
    // not added until C++11. SunCC 12.1 - 12.3 fail to consume the swap,
    // while SunCC 12.4 consumes it without -std=c++11.
    vec_swap(a, b);
#else
    std::swap(a, b);
#endif
}

template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_rol_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_ror_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, 64-R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
#endif
}

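// A note on the dispatch above: with AVX-512VL the rotate compiles to a
// single VPROLQ/VPRORQ, with XOP to a single _mm_roti_epi64, and otherwise
// to the generic two-shifts-plus-OR form. Where only a rotate-left
// primitive is available, a right rotate by R is issued as a left rotate
// by 64-R.
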
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-8);
#else
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

inline __m128i SIMON128_f(const __m128i& v)
{
    return _mm_xor_si128(RotateLeft64<2>(v),
        _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v)));
}

inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+rounds-1)));

        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        Swap128(x1, y1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

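// The _mm_loaddup_pd calls above broadcast one 64-bit subkey into both
// 64-bit lanes of an XMM register; the double cast only satisfies the
// intrinsic's pointer type, and no floating-point arithmetic occurs.
// A sketch of an equivalent integer-domain broadcast (assumes the compiler
// provides _mm_set1_epi64x, which some older 32-bit toolchains lack):
//
//   const __m128i rk = _mm_set1_epi64x((long long)subkeys[i]);
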
inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2));
        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint64x2_p RotateLeft64(const uint64x2_p val)
{
    const uint64x2_p m = {C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint64x2_p RotateRight64(const uint64x2_p val)
{
    const uint64x2_p m = {64-C, 64-C};
    return vec_rl(val, m);
}

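// Altivec only exposes a rotate-left (vec_rl), so RotateRight64 supplies
// the complement count. For a 64-bit lane and 0 < C < 64:
//
//   rotl(v, 64-C) = (v << (64-C)) | (v >> C) = rotr(v, C)
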
inline uint64x2_p SIMON128_f(const uint64x2_p val)
{
    return VecXor(RotateLeft64<2>(val),
        VecAnd(RotateLeft64<1>(val), RotateLeft64<8>(val)));
}

inline void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

inline void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
        y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk1);
        y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
        x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk2);
        x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
        y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk);
        y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

inline void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
        y2 = VecXor(VecXor(y2, rk), SIMON128_f(x2));
        y3 = VecXor(VecXor(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);
        x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk1);
        x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
        y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk2);
        y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_POWER8_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_POWER8_AVAILABLE

NAMESPACE_END
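
// In simon.cpp these wrappers are selected at runtime from
// SIMON128::Enc/Dec::AdvancedProcessBlocks (e.g. when HasSSSE3() or
// HasNEON() reports true). A minimal sketch of a direct call, assuming
// hypothetical subkey and buffer names; the wrappers return the number
// of input bytes they did not consume:
//
//   const size_t remaining = SIMON128_Enc_AdvancedProcessBlocks_SSSE3(
//       subkeys, rounds, inBlocks, NULLPTR, outBlocks, length, 0);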