Crypto++  8.0
Free C++ class library of cryptographic schemes
speck64_simd.cpp
// speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
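//
// For illustration only: a GCC or Clang build of this translation unit
// typically adds target flags along the lines of -mssse3/-msse4.1 (x86),
// -march=armv7-a -mfpu=neon (32-bit ARM) or -maltivec (PowerPC). The exact
// options are selected by the library's build files, not hard-coded here.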

#include "pch.h"
#include "config.h"

#include "speck.h"
#include "misc.h"
#include "adv_simd.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both speck.cpp and speck64_simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <smmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
// compilers don't follow ACLE conventions for the include.
#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SPECK64_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}

#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif // Aarch32 or Aarch64
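
// Note: a rotate by 8 bits moves whole bytes within each 32-bit lane, so the
// specializations above can replace the shift/shift/or sequence with a single
// table-lookup byte permutation (vqtbl1q_u8) on AArch32/AArch64.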

inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

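    // Each iteration applies one Speck64 encryption round to the packed
    // blocks. The scalar round on a word pair (x, y) with round key k is:
    //   x = (ROR32(x, 8) + y) ^ k;
    //   y = ROL32(y, 3) ^ x;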
    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = vaddq_u32(x1, y1);
        x1 = veorq_u32(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = veorq_u32(y1, x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

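    // Inverse round, applied with the round keys in reverse order:
    //   y = ROR32(y ^ x, 3);
    //   x = ROL32((x ^ k) - y, 8);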
    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = veorq_u32(x1, rk);
        x1 = vsubq_u32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = vaddq_u32(x1, y1);
        x2 = vaddq_u32(x2, y2);
        x3 = vaddq_u32(x3, y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        x1 = vsubq_u32(x1, y1);
        x2 = vsubq_u32(x2, y2);
        x3 = vsubq_u32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSE41_AVAILABLE)

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
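
// The byte shuffles above implement the same trick as the NEON TBL
// specializations: an 8-bit rotate of each 32-bit lane is a fixed byte
// permutation, done here with SSSE3 _mm_shuffle_epi8 (or XOP's rotate when
// available).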

inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
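    // _mm_shuffle_ps is used for the deinterleave because it can pick lanes
    // from two source registers, which the pre-AVX integer shuffles cannot;
    // the casts are bit-pattern reinterpretations, not value conversions.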
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = _mm_add_epi32(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    // This is roughly the SSE equivalent of ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // This is roughly the SSE equivalent of ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = _mm_add_epi32(x1, y1);
        x2 = _mm_add_epi32(x2, y2);
        x3 = _mm_add_epi32(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    // This is roughly the SSE equivalent of ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x2 = _mm_sub_epi32(x2, y2);
        x3 = _mm_sub_epi32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // This is roughly the SSE equivalent of ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

#endif // CRYPTOPP_SSE41_AVAILABLE

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;

using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(val, m);
}
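
// Altivec only provides a rotate-left (vec_rl), so a right rotation by C bits
// is expressed above as a left rotation by 32-C bits.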

void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif
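        // The load-and-permute above emulates vec_splats(subkeys[i]) on
        // pre-POWER7 targets; per the note above, the key table carries tail
        // padding so the full 16-byte load at the last subkey stays in bounds.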

        x1 = RotateRight32<8>(x1);
        x1 = VecAdd(x1, y1);
        x1 = VecXor(x1, rk);

        y1 = RotateLeft32<3>(y1);
        y1 = VecXor(y1, x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y1 = RotateRight32<3>(y1);

        x1 = VecXor(x1, rk);
        x1 = VecSub(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);

        x1 = VecAdd(x1, y1);
        x2 = VecAdd(x2, y2);
        x3 = VecAdd(x3, y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);

        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        x1 = VecSub(x1, y1);
        x2 = VecSub(x2, y2);
        x3 = VecSub(x3, y3);

        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
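// The functions below are the exported entry points used by the Speck
// implementation. The AdvancedProcessBlocks64_6x2 adapters (adv_simd.h) feed
// the six-register routines when enough input is available and fall back to
// the two-register routines for the tail, handling the xorBlocks and flags
// bookkeeping for the cipher modes.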
size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSE41_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

NAMESPACE_END