#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <smmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SIMON64_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word32;
using CryptoPP::word64;
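
// SIMON64 operates on 64-bit blocks split into two 32-bit words x and y.
// Every implementation below computes the same round:
//
//   f(x) = (x <<< 1 & x <<< 8) ^ (x <<< 2)
//   y    = y ^ f(x) ^ k
//
// with x and y swapping roles between rounds. The SIMD code keeps the x words
// of several blocks in one vector and the y words in another, so one round
// updates all packed blocks at once.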
#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}
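
// A rotation by 8 bits moves whole bytes, so on Aarch32/Aarch64 the
// specializations below replace the shift/shift/OR sequence with a single
// byte-table lookup (vqtbl1q_u8) using an endian-specific mask.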
#if defined(__aarch32__) || defined(__aarch64__)
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}

template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif  // Aarch32 or Aarch64
inline uint32x4_t SIMON64_f(const uint32x4_t& val)
{
    return veorq_u32(RotateLeft32<2>(val),
        vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);

        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
        std::swap(x1, y1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}
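
// Decryption walks the key schedule in reverse. When the number of rounds is
// odd, the extra round is peeled off before the main loop (encryption peels it
// off after), and the x/y halves are swapped to undo encryption's final swap.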
inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}
inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}
inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
        y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
        y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSE41_AVAILABLE)

inline void Swap128(__m128i& a,__m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    // __m128i is an unsigned long long[2]. Early SunCC fails to consume
    // std::swap of it, so fall back to the library's vec_swap.
    CryptoPP::vec_swap(a, b);
#else
    std::swap(a, b);
#endif
}

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}
template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}
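
// As in the NEON code, a rotation by 8 bits is a pure byte permutation. With
// XOP the rotate instruction (_mm_roti_epi32) is used directly; otherwise
// PSHUFB (_mm_shuffle_epi8) applies a per-lane byte mask.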
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
inline __m128i SIMON64_f(const __m128i& v)
{
    return _mm_xor_si128(RotateLeft32<2>(v),
        _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
}
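
// Each __m128i carries two 64-bit SIMON64 blocks as four 32-bit words. The
// _mm_shuffle_ps casts below gather the odd-indexed words of a vector pair
// into x1 and the even-indexed words into y1, so four blocks share each round;
// _mm_unpacklo/unpackhi_epi32 restore the original word order on the way out.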
inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
        Swap128(x1, y1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    if (rounds & 1)
    {
        Swap128(x1, y1);
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}
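
// The 6-block variants below are the same round code applied to three
// independent x/y vector pairs. Keeping three pairs in flight gives the
// processor more independent work per round, which the
// AdvancedProcessBlocks64_6x2 dispatchers exploit when enough input is
// available.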
inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}
inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    if (rounds & 1)
    {
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}
#endif  // CRYPTOPP_SSE41_AVAILABLE

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;

using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(val, m);
}

inline uint32x4_p SIMON64_f(const uint32x4_p val)
{
    return VecXor(RotateLeft32<2>(val),
        VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
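
// The Altivec code de-interleaves and re-interleaves the x/y words with
// VecPermute and explicit byte masks; the masks differ between big- and
// little-endian targets. Round keys are broadcast with vec_splats on POWER7
// and later, and with a VecLoad plus a broadcast permute on older hardware.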
inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i]);
        const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
        // subkeys has extra elements, so the loads are memory-backed
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk1 = VecLoad(subkeys+i);
        uint32x4_p rk2 = VecLoad(subkeys+i+1);
        rk1 = VecPermute(rk1, rk1, m);
        rk2 = VecPermute(rk2, rk2, m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+rounds-1);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
        std::swap(x1, y1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1);
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+rounds-1);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
        const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk1 = VecLoad(subkeys+i+1);
        uint32x4_p rk2 = VecLoad(subkeys+i);
        rk1 = VecPermute(rk1, rk1, m);
        rk2 = VecPermute(rk2, rk2, m);
#endif

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);
    uint32x4_p x2 = VecPermute(block2, block3, m1);
    uint32x4_p y2 = VecPermute(block2, block3, m2);
    uint32x4_p x3 = VecPermute(block4, block5, m1);
    uint32x4_p y3 = VecPermute(block4, block5, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i]);
        const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk1 = VecLoad(subkeys+i);
        uint32x4_p rk2 = VecLoad(subkeys+i+1);
        rk1 = VecPermute(rk1, rk1, m);
        rk2 = VecPermute(rk2, rk2, m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
        x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
        x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+rounds-1);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);
    uint32x4_p x2 = VecPermute(block2, block3, m1);
    uint32x4_p y2 = VecPermute(block2, block3, m2);
    uint32x4_p x3 = VecPermute(block4, block5, m1);
    uint32x4_p y3 = VecPermute(block4, block5, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+rounds-1);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
        y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
        y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
        const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk1 = VecLoad(subkeys+i+1);
        uint32x4_p rk2 = VecLoad(subkeys+i);
        rk1 = VecPermute(rk1, rk1, m);
        rk2 = VecPermute(rk2, rk2, m);
#endif

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
        x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
        x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
#endif  // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
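
// The exported functions below are thin adapters: they hand the block workers
// above to the AdvancedProcessBlocks64_6x2_* templates in adv_simd.h, which
// take care of loading, storing and xor-ing the byte-oriented input, output
// and mask buffers. They are intended to be called from simon.cpp once the
// corresponding instruction set has been detected at run time.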
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSE41_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif  // CRYPTOPP_SSE41_AVAILABLE

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif  // CRYPTOPP_ALTIVEC_AVAILABLE

NAMESPACE_END