Crypto++  8.0
Free C++ class library of cryptographic schemes
simon64_simd.cpp
1 // simon64_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON, ARMv8a, and Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
6 // the appropriate instruction sets in some build configurations.
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "simon.h"
12 #include "misc.h"
13 #include "adv_simd.h"
14 
15 // Uncomment for benchmarking C++ against SSE or NEON.
16 // Do so in both simon.cpp and simon64_simd.cpp.
17 // #undef CRYPTOPP_SSE41_AVAILABLE
18 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
19 
20 #if (CRYPTOPP_SSSE3_AVAILABLE)
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if (CRYPTOPP_SSE41_AVAILABLE)
26 # include <smmintrin.h>
27 #endif
28 
29 #if defined(__XOP__)
30 # include <ammintrin.h>
31 #endif
32 
33 #if defined(__AVX512F__) && defined(__AVX512VL__)
34 # define CRYPTOPP_AVX512_ROTATE 1
35 # include <immintrin.h>
36 #endif
37 
38 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
39 # include <arm_neon.h>
40 #endif
41 
42 // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
43 // compilers don't follow ACLE conventions for the include.
44 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
45 # include <stdint.h>
46 # include <arm_acle.h>
47 #endif
48 
49 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
50 # include "ppc_simd.h"
51 #endif
52 
53 // Squash MS LNK4221 and libtool warnings
54 extern const char SIMON64_SIMD_FNAME[] = __FILE__;
55 
56 ANONYMOUS_NAMESPACE_BEGIN
57 
58 using CryptoPP::byte;
59 using CryptoPP::word32;
60 using CryptoPP::word64;
61 using CryptoPP::vec_swap; // SunCC
62 
63 // *************************** ARM NEON ************************** //
64 
65 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
66 
67 template <class T>
68 inline T UnpackHigh32(const T& a, const T& b)
69 {
70  const uint32x2_t x(vget_high_u32((uint32x4_t)a));
71  const uint32x2_t y(vget_high_u32((uint32x4_t)b));
72  const uint32x2x2_t r = vzip_u32(x, y);
73  return (T)vcombine_u32(r.val[0], r.val[1]);
74 }
75 
76 template <class T>
77 inline T UnpackLow32(const T& a, const T& b)
78 {
79  const uint32x2_t x(vget_low_u32((uint32x4_t)a));
80  const uint32x2_t y(vget_low_u32((uint32x4_t)b));
81  const uint32x2x2_t r = vzip_u32(x, y);
82  return (T)vcombine_u32(r.val[0], r.val[1]);
83 }
84 
85 template <unsigned int R>
86 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
87 {
88  const uint32x4_t a(vshlq_n_u32(val, R));
89  const uint32x4_t b(vshrq_n_u32(val, 32 - R));
90  return vorrq_u32(a, b);
91 }
92 
93 template <unsigned int R>
94 inline uint32x4_t RotateRight32(const uint32x4_t& val)
95 {
96  const uint32x4_t a(vshlq_n_u32(val, 32 - R));
97  const uint32x4_t b(vshrq_n_u32(val, R));
98  return vorrq_u32(a, b);
99 }
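// The generic rotates above use the usual shift/shift/or construction; the
// rotate amount is a template parameter so vshlq_n_u32 and vshrq_n_u32 receive
// compile-time constants, which those intrinsics require.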
100 
101 #if defined(__aarch32__) || defined(__aarch64__)
102 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
103 template <>
104 inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
105 {
106 #if (CRYPTOPP_BIG_ENDIAN)
107  const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
108  const uint8x16_t mask = vld1q_u8(maskb);
109 #else
110  const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
111  const uint8x16_t mask = vld1q_u8(maskb);
112 #endif
113 
114  return vreinterpretq_u32_u8(
115  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
116 }
117 
118 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
119 template <>
120 inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
121 {
122 #if (CRYPTOPP_BIG_ENDIAN)
123  const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
124  const uint8x16_t mask = vld1q_u8(maskb);
125 #else
126  const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
127  const uint8x16_t mask = vld1q_u8(maskb);
128 #endif
129 
130  return vreinterpretq_u32_u8(
131  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
132 }
133 #endif
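// How the tables above are derived: rotating a 32-bit lane by 8 bits only
// moves whole bytes, so each output byte of a lane is an input byte (on
// little-endian, output byte j takes input byte (j+3)%4 for the left rotate
// and (j+1)%4 for the right rotate). Scalar equivalents as a reference
// sketch; the helper names are illustrative, not part of the library:
inline word32 RotL8_Scalar(word32 v) { return (v << 8) | (v >> 24); }
inline word32 RotR8_Scalar(word32 v) { return (v >> 8) | (v << 24); }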
134 
135 inline uint32x4_t SIMON64_f(const uint32x4_t& val)
136 {
137  return veorq_u32(RotateLeft32<2>(val),
138  vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
139 }
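// A scalar model of the round function and of one SIMON round, for reference
// only: f(x) = (x <<< 1 & x <<< 8) ^ (x <<< 2), and a round maps (x, y) to
// (y ^ f(x) ^ k, x). The loops below unroll two such rounds and alternate the
// roles of x1 and y1 instead of swapping. Helper names are illustrative:
inline word32 SIMON64_f_Scalar(word32 x)
{
    const word32 r1 = (x << 1) | (x >> 31);
    const word32 r2 = (x << 2) | (x >> 30);
    const word32 r8 = (x << 8) | (x >> 24);
    return r2 ^ (r1 & r8);
}

inline void SIMON64_Round_Scalar(word32& x, word32& y, word32 k)
{
    const word32 t = x;
    x = y ^ SIMON64_f_Scalar(x) ^ k;
    y = t;
}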
140 
141 inline void SIMON64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
142  const word32 *subkeys, unsigned int rounds)
143 {
144  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
145  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
146  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
147 
148  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
149  {
150  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
151  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
152 
153  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
154  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
155  }
156 
157  if (rounds & 1)
158  {
159  const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);
160 
161  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
162  std::swap(x1, y1);
163  }
164 
165  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
166  block0 = UnpackLow32(y1, x1);
167  block1 = UnpackHigh32(y1, x1);
168 }
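// Note on the shuffles above: each uint32x4_t argument carries two 64-bit
// SIMON blocks, so a single call advances four blocks. vuzpq_u32 gathers the
// even-indexed words into y1 and the odd-indexed words into x1, giving one
// block half per lane; UnpackLow32/UnpackHigh32 (vzip) restore the original
// word order on the way out.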
169 
170 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
171  const word32 *subkeys, unsigned int rounds)
172 {
173  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
174  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
175  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
176 
177  if (rounds & 1)
178  {
179  std::swap(x1, y1);
180  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
181 
182  y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
183  rounds--;
184  }
185 
186  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
187  {
188  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
189  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
190 
191  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
192  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
193  }
194 
195  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
196  block0 = UnpackLow32(y1, x1);
197  block1 = UnpackHigh32(y1, x1);
198 }
199 
200 inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
201  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
202  const word32 *subkeys, unsigned int rounds)
203 {
204  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
205  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
206  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
207  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
208  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
209  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
210  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
211 
212  for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
213  {
214  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
215  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
216  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
217  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);
218 
219  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
220  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
221  x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
222  x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
223  }
224 
225  if (rounds & 1)
226  {
227  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
228 
229  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
230  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
231  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
232  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
233  }
234 
235  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
236  block0 = UnpackLow32(y1, x1);
237  block1 = UnpackHigh32(y1, x1);
238  block2 = UnpackLow32(y2, x2);
239  block3 = UnpackHigh32(y2, x2);
240  block4 = UnpackLow32(y3, x3);
241  block5 = UnpackHigh32(y3, x3);
242 }
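// The 6-block variants repeat the same rounds on three independent vector
// pairs so the rotate/xor dependency chains of different blocks can overlap
// in the pipeline; the round-key load and splat is shared by all three pairs.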
243 
244 inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
245  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
246  const word32 *subkeys, unsigned int rounds)
247 {
248  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
249  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
250  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
251  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
252  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
253  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
254  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
255 
256  if (rounds & 1)
257  {
258  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
259  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
260 
261  y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
262  y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
263  y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
264  rounds--;
265  }
266 
267  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
268  {
269  const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
270  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
271  x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
272  x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);
273 
274  const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
275  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
276  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
277  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
278  }
279 
280  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
281  block0 = UnpackLow32(y1, x1);
282  block1 = UnpackHigh32(y1, x1);
283  block2 = UnpackLow32(y2, x2);
284  block3 = UnpackHigh32(y2, x2);
285  block4 = UnpackLow32(y3, x3);
286  block5 = UnpackHigh32(y3, x3);
287 }
288 
289 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
290 
291 // ***************************** IA-32 ***************************** //
292 
293 #if defined(CRYPTOPP_SSE41_AVAILABLE)
294 
295 inline void Swap128(__m128i& a,__m128i& b)
296 {
297 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
298  // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
299 // SunCC 12.1 - 12.3 fail to consume the swap, while SunCC 12.4 consumes it without -std=c++11.
300  vec_swap(a, b);
301 #else
302  std::swap(a, b);
303 #endif
304 }
305 
306 template <unsigned int R>
307 inline __m128i RotateLeft32(const __m128i& val)
308 {
309 #if defined(__XOP__)
310  return _mm_roti_epi32(val, R);
311 #else
312  return _mm_or_si128(
313  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
314 #endif
315 }
316 
317 template <unsigned int R>
318 inline __m128i RotateRight32(const __m128i& val)
319 {
320 #if defined(__XOP__)
321  return _mm_roti_epi32(val, 32-R);
322 #else
323  return _mm_or_si128(
324  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
325 #endif
326 }
327 
328 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
329 template <>
330 __m128i RotateLeft32<8>(const __m128i& val)
331 {
332 #if defined(__XOP__)
333  return _mm_roti_epi32(val, 8);
334 #else
335  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
336  return _mm_shuffle_epi8(val, mask);
337 #endif
338 }
339 
340 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
341 template <>
342 __m128i RotateRight32<8>(const __m128i& val)
343 {
344 #if defined(__XOP__)
345  return _mm_roti_epi32(val, 32-8);
346 #else
347  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
348  return _mm_shuffle_epi8(val, mask);
349 #endif
350 }
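// Note: _mm_set_epi8 lists bytes from the most significant position down to
// the least, so the constants above encode the same per-lane byte permutation
// as the NEON tables. A small self-check sketch (the function name is
// illustrative, not library API):
inline bool RotateLeft8_SelfCheck()
{
    const __m128i v = _mm_set1_epi32(0x12345678);
    const __m128i r = RotateLeft32<8>(v);
    return _mm_extract_epi32(r, 0) == 0x34567812;  // 0x12345678 <<< 8
}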
351 
352 inline __m128i SIMON64_f(const __m128i& v)
353 {
354  return _mm_xor_si128(RotateLeft32<2>(v),
355  _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
356 }
357 
358 inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
359  const word32 *subkeys, unsigned int rounds)
360 {
361  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
362  const __m128 t0 = _mm_castsi128_ps(block0);
363  const __m128 t1 = _mm_castsi128_ps(block1);
364  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
365  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
366 
367  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
368  {
369  const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
370  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
371 
372  const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
373  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
374  }
375 
376  if (rounds & 1)
377  {
378  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
379  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
380  Swap128(x1, y1);
381  }
382 
383  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
384  block0 = _mm_unpacklo_epi32(y1, x1);
385  block1 = _mm_unpackhi_epi32(y1, x1);
386 }
387 
388 inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
389  const word32 *subkeys, unsigned int rounds)
390 {
391  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
392  const __m128 t0 = _mm_castsi128_ps(block0);
393  const __m128 t1 = _mm_castsi128_ps(block1);
394  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
395  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
396 
397  if (rounds & 1)
398  {
399  Swap128(x1, y1);
400  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
401  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
402  rounds--;
403  }
404 
405  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
406  {
407  const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
408  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
409 
410  const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
411  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
412  }
413 
414  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
415  block0 = _mm_unpacklo_epi32(y1, x1);
416  block1 = _mm_unpackhi_epi32(y1, x1);
417 }
418 
419 inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
420  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
421  const word32 *subkeys, unsigned int rounds)
422 {
423  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
424  const __m128 t0 = _mm_castsi128_ps(block0);
425  const __m128 t1 = _mm_castsi128_ps(block1);
426  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
427  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
428 
429  const __m128 t2 = _mm_castsi128_ps(block2);
430  const __m128 t3 = _mm_castsi128_ps(block3);
431  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
432  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
433 
434  const __m128 t4 = _mm_castsi128_ps(block4);
435  const __m128 t5 = _mm_castsi128_ps(block5);
436  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
437  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
438 
439  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
440  {
441  const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
442  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
443  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
444  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
445 
446  const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
447  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
448  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
449  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
450  }
451 
452  if (rounds & 1)
453  {
454  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
455  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
456  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
457  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
458  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
459  }
460 
461  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
462  block0 = _mm_unpacklo_epi32(y1, x1);
463  block1 = _mm_unpackhi_epi32(y1, x1);
464  block2 = _mm_unpacklo_epi32(y2, x2);
465  block3 = _mm_unpackhi_epi32(y2, x2);
466  block4 = _mm_unpacklo_epi32(y3, x3);
467  block5 = _mm_unpackhi_epi32(y3, x3);
468 }
469 
470 inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
471  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
472  const word32 *subkeys, unsigned int rounds)
473 {
474  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
475  const __m128 t0 = _mm_castsi128_ps(block0);
476  const __m128 t1 = _mm_castsi128_ps(block1);
477  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
478  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
479 
480  const __m128 t2 = _mm_castsi128_ps(block2);
481  const __m128 t3 = _mm_castsi128_ps(block3);
482  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
483  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
484 
485  const __m128 t4 = _mm_castsi128_ps(block4);
486  const __m128 t5 = _mm_castsi128_ps(block5);
487  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
488  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
489 
490  if (rounds & 1)
491  {
492  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
493  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
494  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
495  y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
496  y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
497  rounds--;
498  }
499 
500  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
501  {
502  const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
503  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
504  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
505  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
506 
507  const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
508  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
509  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
510  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
511  }
512 
513  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
514  block0 = _mm_unpacklo_epi32(y1, x1);
515  block1 = _mm_unpackhi_epi32(y1, x1);
516  block2 = _mm_unpacklo_epi32(y2, x2);
517  block3 = _mm_unpackhi_epi32(y2, x2);
518  block4 = _mm_unpacklo_epi32(y3, x3);
519  block5 = _mm_unpackhi_epi32(y3, x3);
520 }
521 
522 #endif // CRYPTOPP_SSE41_AVAILABLE
523 
524 // ***************************** Altivec ***************************** //
525 
526 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
527 
528 using CryptoPP::uint8x16_p;
529 using CryptoPP::uint32x4_p;
530 
531 using CryptoPP::VecAnd;
532 using CryptoPP::VecXor;
533 using CryptoPP::VecLoad;
534 using CryptoPP::VecLoadBE;
535 using CryptoPP::VecPermute;
536 
537 // Rotate left by bit count
538 template<unsigned int C>
539 inline uint32x4_p RotateLeft32(const uint32x4_p val)
540 {
541  const uint32x4_p m = {C, C, C, C};
542  return vec_rl(val, m);
543 }
544 
545 // Rotate right by bit count
546 template<unsigned int C>
547 inline uint32x4_p RotateRight32(const uint32x4_p val)
548 {
549  const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
550  return vec_rl(val, m);
551 }
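// Altivec provides only a left rotate (vec_rl), so RotateRight32 is expressed
// as a left rotate by 32-C. For example, RotateRight32<8>(v) performs the same
// permutation as RotateLeft32<24>(v).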
552 
553 inline uint32x4_p SIMON64_f(const uint32x4_p val)
554 {
555  return VecXor(RotateLeft32<2>(val),
556  VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
557 }
558 
559 inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
560  const word32 *subkeys, unsigned int rounds)
561 {
562 #if (CRYPTOPP_BIG_ENDIAN)
563  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
564  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
565 #else
566  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
567  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
568 #endif
569 
570  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
571  uint32x4_p x1 = VecPermute(block0, block1, m1);
572  uint32x4_p y1 = VecPermute(block0, block1, m2);
573 
574  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
575  {
576 #if CRYPTOPP_POWER7_AVAILABLE
577  const uint32x4_p rk1 = vec_splats(subkeys[i]);
578  const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
579 #else
580  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
581  uint32x4_p rk1 = VecLoad(subkeys+i);
582  uint32x4_p rk2 = VecLoad(subkeys+i+1);
583  rk1 = VecPermute(rk1, rk1, m);
584  rk2 = VecPermute(rk2, rk2, m);
585 #endif
586  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
587  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
588  }
589 
590  if (rounds & 1)
591  {
592 #if CRYPTOPP_POWER7_AVAILABLE
593  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
594 #else
595  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
596  uint32x4_p rk = VecLoad(subkeys+rounds-1);
597  rk = VecPermute(rk, rk, m);
598 #endif
599  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
600  std::swap(x1, y1);
601  }
602 
603 #if (CRYPTOPP_BIG_ENDIAN)
604  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
605  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
606 #else
607  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
608  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
609 #endif
610 
611  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
612  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
613  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
614 }
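// The subkey broadcast used above, as a stand-alone sketch (the helper name is
// illustrative, not library API): POWER7 and later can splat a scalar directly
// with vec_splats, while plain Altivec loads the word and repeats its four
// bytes with a permute.
inline uint32x4_p BroadcastSubkey_Sketch(const word32* sk)
{
#if CRYPTOPP_POWER7_AVAILABLE
    return vec_splats(sk[0]);
#else
    const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
    const uint32x4_p rk = VecLoad(sk);
    return VecPermute(rk, rk, m);
#endif
}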
615 
616 inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
617  const word32 *subkeys, unsigned int rounds)
618 {
619 #if (CRYPTOPP_BIG_ENDIAN)
620  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
621  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
622 #else
623  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
624  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
625 #endif
626 
627  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
628  uint32x4_p x1 = VecPermute(block0, block1, m1);
629  uint32x4_p y1 = VecPermute(block0, block1, m2);
630 
631  if (rounds & 1)
632  {
633  std::swap(x1, y1);
634 #if CRYPTOPP_POWER7_AVAILABLE
635  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
636 #else
637  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
638  uint32x4_p rk = VecLoad(subkeys+rounds-1);
639  rk = VecPermute(rk, rk, m);
640 #endif
641  y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
642  rounds--;
643  }
644 
645  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
646  {
647 #if CRYPTOPP_POWER7_AVAILABLE
648  const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
649  const uint32x4_p rk2 = vec_splats(subkeys[i]);
650 #else
651  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
652  uint32x4_p rk1 = VecLoad(subkeys+i+1);
653  uint32x4_p rk2 = VecLoad(subkeys+i);
654  rk1 = VecPermute(rk1, rk1, m);
655  rk2 = VecPermute(rk2, rk2, m);
656 #endif
657  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
658  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
659  }
660 
661 #if (CRYPTOPP_BIG_ENDIAN)
662  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
663  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
664 #else
665  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
666  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
667 #endif
668 
669  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
670  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
671  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
672 }
673 
674 inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
675  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
676  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
677 {
678 #if (CRYPTOPP_BIG_ENDIAN)
679  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
680  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
681 #else
682  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
683  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
684 #endif
685 
686  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
687  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
688  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
689  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
690  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
691  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
692  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
693 
694  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
695  {
696 #if CRYPTOPP_POWER7_AVAILABLE
697  const uint32x4_p rk1 = vec_splats(subkeys[i]);
698  const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
699 #else
700  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
701  uint32x4_p rk1 = VecLoad(subkeys+i);
702  uint32x4_p rk2 = VecLoad(subkeys+i+1);
703  rk1 = VecPermute(rk1, rk1, m);
704  rk2 = VecPermute(rk2, rk2, m);
705 #endif
706  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
707  y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
708  y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);
709 
710  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
711  x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
712  x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
713  }
714 
715  if (rounds & 1)
716  {
717 #if CRYPTOPP_POWER7_AVAILABLE
718  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
719 #else
720  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
721  uint32x4_p rk = VecLoad(subkeys+rounds-1);
722  rk = VecPermute(rk, rk, m);
723 #endif
724  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
725  y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
726  y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
727  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
728  }
729 
730 #if (CRYPTOPP_BIG_ENDIAN)
731  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
732  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
733 #else
734  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
735  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
736 #endif
737 
738  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
739  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
740  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
741  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
742  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
743  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
744  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
745 }
746 
747 inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
748  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
749  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
750 {
751 #if (CRYPTOPP_BIG_ENDIAN)
752  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
753  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
754 #else
755  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
756  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
757 #endif
758 
759  // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
760  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
761  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
762  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
763  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
764  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
765  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
766 
767  if (rounds & 1)
768  {
769  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
770 
771 #if CRYPTOPP_POWER7_AVAILABLE
772  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
773 #else
774  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
775  uint32x4_p rk = VecLoad(subkeys+rounds-1);
776  rk = VecPermute(rk, rk, m);
777 #endif
778  y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
779  y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
780  y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
781  rounds--;
782  }
783 
784  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
785  {
786 #if CRYPTOPP_POWER7_AVAILABLE
787  const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
788  const uint32x4_p rk2 = vec_splats(subkeys[i]);
789 #else
790  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
791  uint32x4_p rk1 = VecLoad(subkeys+i+1);
792  uint32x4_p rk2 = VecLoad(subkeys+i);
793  rk1 = VecPermute(rk1, rk1, m);
794  rk2 = VecPermute(rk2, rk2, m);
795 #endif
796  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
797  x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
798  x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
799 
800  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
801  y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
802  y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
803  }
804 
805 #if (CRYPTOPP_BIG_ENDIAN)
806  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
807  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
808 #else
809  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
810  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
811 #endif
812 
813  // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
814  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
815  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
816  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
817  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
818  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
819  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
820 }
821 
822 #endif // CRYPTOPP_ALTIVEC_AVAILABLE
823 
824 ANONYMOUS_NAMESPACE_END
825 
826 ///////////////////////////////////////////////////////////////////////
827 
828 NAMESPACE_BEGIN(CryptoPP)
829 
830 // *************************** ARM NEON **************************** //
831 
832 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
833 size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
834  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
835 {
836  return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
837  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
838 }
839 
840 size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
841  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
842 {
843  return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
844  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
845 }
846 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
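// The AdvancedProcessBlocks64_6x2_* templates in adv_simd.h supply the
// block-walking logic shared by several 64-bit block ciphers: broadly, they
// feed the six-vector kernel while enough input remains, fall back to the
// two-vector kernel for smaller chunks, and handle the inBlocks/xorBlocks/
// outBlocks stepping and flag bookkeeping. The wrappers here only bind
// SIMON64's kernels to that machinery.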
847 
848 // ***************************** IA-32 ***************************** //
849 
850 #if defined(CRYPTOPP_SSE41_AVAILABLE)
851 size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
852  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
853 {
854  return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
855  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
856 }
857 
858 size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
859  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
860 {
861  return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
862  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
863 }
864 #endif
865 
866 // ***************************** Altivec ***************************** //
867 
868 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
869 size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
870  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
871 {
872  return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
873  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
874 }
875 
876 size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
877  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
878 {
879  return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
880  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
881 }
882 #endif
883 
884 NAMESPACE_END
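None of the functions above are called directly by applications; SIMON64's base class selects them at runtime when the CPU reports SSE4.1, NEON, or Altivec support. A minimal usage sketch through the normal Crypto++ API, assuming a standard library build; the key length and strings are illustrative:

#include "simon.h"
#include "modes.h"
#include "filters.h"
#include "osrng.h"
#include "secblock.h"

#include <iostream>
#include <string>

int main()
{
    using namespace CryptoPP;

    AutoSeededRandomPool prng;
    SecByteBlock key(SIMON64::Encryption::DEFAULT_KEYLENGTH);
    prng.GenerateBlock(key, key.size());

    // ECB mode exercises the block-oriented SIMD paths in this file
    ECB_Mode<SIMON64>::Encryption enc;
    enc.SetKey(key, key.size());

    std::string plain = "SIMON64 SIMD test vector", cipher;
    StringSource(plain, true,
        new StreamTransformationFilter(enc, new StringSink(cipher)));

    std::cout << "ciphertext bytes: " << cipher.size() << std::endl;
    return 0;
}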