Crypto++  8.0
Free C++ class library of cryptographic schemes
lea_simd.cpp
1 // lea_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "lea.h"
12 #include "misc.h"
13 #include "adv_simd.h"
14 
15 // Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both lea.cpp and lea_simd.cpp.
17 // #undef CRYPTOPP_SSSE3_AVAILABLE
18 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
19 
20 #if (CRYPTOPP_SSSE3_AVAILABLE)
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if defined(__XOP__)
26 # include <ammintrin.h>
27 #endif
28 
29 #if defined(__AVX512F__) && defined(__AVX512VL__)
30 # define CRYPTOPP_AVX512_ROTATE 1
31 # include <immintrin.h>
32 #endif
33 
34 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
35 # include <arm_neon.h>
36 #endif
37 
38 // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
39 // compilers don't follow ACLE conventions for the include.
40 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
41 # include <stdint.h>
42 # include <arm_acle.h>
43 #endif
44 
45 // Do not port this to POWER architecture. Naively we hoped
46 // for a 2x to 3x speedup. The result was a 5x slow down
47 // because of the rotates and scattered loads.
48 //
49 // C++:
50 // <TD>LEA-128(128)/CTR (128-bit key)<TD>C++<TD>207<TD>15.64<TD>0.593<TD>2015
51 // <TD>LEA-128(192)/CTR (192-bit key)<TD>C++<TD>186<TD>17.48<TD>0.699<TD>2378
52 // <TD>LEA-128(256)/CTR (256-bit key)<TD>C++<TD>124<TD>26.2<TD>0.842<TD>2861
53 //
54 // Power8:
55 // <TD>LEA-128(128)/CTR (128-bit key)<TD>Power8<TD>37<TD>88.7<TD>0.595<TD>2023
56 // <TD>LEA-128(192)/CTR (192-bit key)<TD>Power8<TD>40<TD>82.1<TD>0.699<TD>2375
57 // <TD>LEA-128(256)/CTR (256-bit key)<TD>Power8<TD>28<TD>116.0<TD>1.006<TD>3419
58 
59 #undef CRYPTOPP_POWER8_AVAILABLE
60 #if defined(CRYPTOPP_POWER8_AVAILABLE)
61 # include "ppc_simd.h"
62 #endif
63 
64 // Squash MS LNK4221 and libtool warnings
65 extern const char LEA_SIMD_FNAME[] = __FILE__;
66 
67 ANONYMOUS_NAMESPACE_BEGIN
68 
69 using CryptoPP::word32;
70 
71 // *************************** ARM NEON ***************************//
72 
73 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
74 
75 inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b)
76 {
77  return veorq_u32(a, b);
78 }
79 
80 inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b)
81 {
82  return vaddq_u32(a, b);
83 }
84 
85 inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b)
86 {
87  return vsubq_u32(a, b);
88 }
89 
90 template <unsigned int R>
91 inline uint32x4_t RotateLeft(const uint32x4_t& val)
92 {
93  const uint32x4_t a(vshlq_n_u32(val, R));
94  const uint32x4_t b(vshrq_n_u32(val, 32 - R));
95  return vorrq_u32(a, b);
96 }
97 
98 template <unsigned int R>
99 inline uint32x4_t RotateRight(const uint32x4_t& val)
100 {
101  const uint32x4_t a(vshlq_n_u32(val, 32 - R));
102  const uint32x4_t b(vshrq_n_u32(val, R));
103  return vorrq_u32(a, b);
104 }
105 
106 #if defined(__aarch32__) || defined(__aarch64__)
107 template <>
108 inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
109 {
110 #if (CRYPTOPP_BIG_ENDIAN)
111  const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
112  const uint8x16_t mask = vld1q_u8(maskb);
113 #else
114  const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
115  const uint8x16_t mask = vld1q_u8(maskb);
116 #endif
117 
118  return vreinterpretq_u32_u8(
119  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
120 }
121 
122 template <>
123 inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
124 {
125 #if (CRYPTOPP_BIG_ENDIAN)
126  const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
127  const uint8x16_t mask = vld1q_u8(maskb);
128 #else
129  const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,14,12 };
130  const uint8x16_t mask = vld1q_u8(maskb);
131 #endif
132 
133  return vreinterpretq_u32_u8(
134  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
135 }
136 #endif
137 
138 uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b)
139 {
140  uint32x2_t a1 = vget_low_u32(a);
141  uint32x2_t b1 = vget_low_u32(b);
142  uint32x2x2_t result = vzip_u32(a1, b1);
143  return vcombine_u32(result.val[0], result.val[1]);
144 }
145 
146 uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b)
147 {
148  uint32x2_t a1 = vget_high_u32(a);
149  uint32x2_t b1 = vget_high_u32(b);
150  uint32x2x2_t result = vzip_u32(a1, b1);
151  return vcombine_u32(result.val[0], result.val[1]);
152 }
153 
154 uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b)
155 {
156  uint64x1_t a1 = vget_low_u64((uint64x2_t)a);
157  uint64x1_t b1 = vget_low_u64((uint64x2_t)b);
158  return (uint32x4_t)vcombine_u64(a1, b1);
159 }
160 
161 uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
162 {
163  uint64x1_t a1 = vget_high_u64((uint64x2_t)a);
164  uint64x1_t b1 = vget_high_u64((uint64x2_t)b);
165  return (uint32x4_t)vcombine_u64(a1, b1);
166 }
167 
168 template <unsigned int IDX>
169 inline uint32x4_t LoadKey(const word32 rkey[])
170 {
171  return vdupq_n_u32(rkey[IDX]);
172 }
173 
174 template <unsigned int IDX>
175 inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
176 {
177  // Should not be instantiated
178  CRYPTOPP_ASSERT(0);;
179  return vmovq_n_u32(0);
180 }
181 
182 template <>
183 inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
184 {
185  const uint32x4_t r1 = UnpackLow32(a, b);
186  const uint32x4_t r2 = UnpackLow32(c, d);
187  return UnpackLow64(r1, r2);
188 }
189 
190 template <>
191 inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
192 {
193  const uint32x4_t r1 = UnpackLow32(a, b);
194  const uint32x4_t r2 = UnpackLow32(c, d);
195  return UnpackHigh64(r1, r2);
196 }
197 
198 template <>
199 inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
200 {
201  const uint32x4_t r1 = UnpackHigh32(a, b);
202  const uint32x4_t r2 = UnpackHigh32(c, d);
203  return UnpackLow64(r1, r2);
204 }
205 
206 template <>
207 inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
208 {
209  const uint32x4_t r1 = UnpackHigh32(a, b);
210  const uint32x4_t r2 = UnpackHigh32(c, d);
211  return UnpackHigh64(r1, r2);
212 }
213 
214 template <unsigned int IDX>
215 inline uint32x4_t UnpackNEON(const uint32x4_t& v)
216 {
217  // Should not be instantiated
218  CRYPTOPP_ASSERT(0);;
219  return vmovq_n_u32(0);
220 }
221 
222 template <>
223 inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v)
224 {
225  // Splat to all lanes
226  return vdupq_n_u32(vgetq_lane_u32(v, 0));
227 }
228 
229 template <>
230 inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v)
231 {
232  // Splat to all lanes
233  return vdupq_n_u32(vgetq_lane_u32(v, 1));
234 }
235 
236 template <>
237 inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v)
238 {
239  // Splat to all lanes
240  return vdupq_n_u32(vgetq_lane_u32(v, 2));
241 }
242 
243 template <>
244 inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v)
245 {
246  // Splat to all lanes
247  return vdupq_n_u32(vgetq_lane_u32(v, 3));
248 }
249 
250 template <unsigned int IDX>
251 inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
252 {
253  return UnpackNEON<IDX>(a, b, c, d);
254 }
255 
256 template <unsigned int IDX>
257 inline uint32x4_t RepackNEON(const uint32x4_t& v)
258 {
259  return UnpackNEON<IDX>(v);
260 }
261 
262 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
263 
264 // *************************** IA-32 ***************************//
265 
266 #if (CRYPTOPP_SSSE3_AVAILABLE)
267 
268 inline __m128i Xor(const __m128i& a, const __m128i& b)
269 {
270  return _mm_xor_si128(a, b);
271 }
272 
273 inline __m128i Add(const __m128i& a, const __m128i& b)
274 {
275  return _mm_add_epi32(a, b);
276 }
277 
278 inline __m128i Sub(const __m128i& a, const __m128i& b)
279 {
280  return _mm_sub_epi32(a, b);
281 }
282 
283 template <unsigned int R>
284 inline __m128i RotateLeft(const __m128i& val)
285 {
286 #if defined(__XOP__)
287  return _mm_roti_epi32(val, R);
288 #else
289  return _mm_or_si128(
290  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
291 #endif
292 }
293 
294 template <unsigned int R>
295 inline __m128i RotateRight(const __m128i& val)
296 {
297 #if defined(__XOP__)
298  return _mm_roti_epi32(val, 32-R);
299 #else
300  return _mm_or_si128(
301  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
302 #endif
303 }
304 
305 // Faster than two Shifts and an Or.
306 template <>
307 inline __m128i RotateLeft<8>(const __m128i& val)
308 {
309 #if defined(__XOP__)
310  return _mm_roti_epi32(val, 8);
311 #else
312  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
313  return _mm_shuffle_epi8(val, mask);
314 #endif
315 }
316 
317 // Faster than two Shifts and an Or.
318 template <>
319 inline __m128i RotateRight<8>(const __m128i& val)
320 {
321 #if defined(__XOP__)
322  return _mm_roti_epi32(val, 32-8);
323 #else
324  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
325  return _mm_shuffle_epi8(val, mask);
326 #endif
327 }
328 
329 template <unsigned int IDX>
330 inline __m128i LoadKey(const word32 rkey[])
331 {
332  float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk));
333  return _mm_castps_si128(_mm_load_ps1(&rk));
334 }
335 
336 template <unsigned int IDX>
337 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
338 {
339  // Should not be instantiated
340  CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
341  CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
342  CRYPTOPP_ASSERT(0);
343  return _mm_setzero_si128();
344 }
345 
346 template <>
347 inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
348 {
349  // LEA is little-endian oriented, so there is no need for a separate shuffle.
350  const __m128i r1 = _mm_unpacklo_epi32(a, b);
351  const __m128i r2 = _mm_unpacklo_epi32(c, d);
352  return _mm_unpacklo_epi64(r1, r2);
353 }
354 
355 template <>
356 inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
357 {
358  // LEA is little-endian oriented, so there is no need for a separate shuffle.
359  const __m128i r1 = _mm_unpacklo_epi32(a, b);
360  const __m128i r2 = _mm_unpacklo_epi32(c, d);
361  return _mm_unpackhi_epi64(r1, r2);
362 }
363 
364 template <>
365 inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
366 {
367  // LEA is little-endian oriented, so there is no need for a separate shuffle.
368  const __m128i r1 = _mm_unpackhi_epi32(a, b);
369  const __m128i r2 = _mm_unpackhi_epi32(c, d);
370  return _mm_unpacklo_epi64(r1, r2);
371 }
372 
373 template <>
374 inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
375 {
376  // LEA is little-endian oriented, so there is no need for a separate shuffle.
377  const __m128i r1 = _mm_unpackhi_epi32(a, b);
378  const __m128i r2 = _mm_unpackhi_epi32(c, d);
379  return _mm_unpackhi_epi64(r1, r2);
380 }
381 
382 template <unsigned int IDX>
383 inline __m128i UnpackXMM(const __m128i& v)
384 {
385  // Should not be instantiated
386  CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
387  return _mm_setzero_si128();
388 }
389 
390 template <>
391 inline __m128i UnpackXMM<0>(const __m128i& v)
392 {
393  // Splat to all lanes
394  return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
395 }
396 
397 template <>
398 inline __m128i UnpackXMM<1>(const __m128i& v)
399 {
400  // Splat to all lanes
401  return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
402 }
403 
404 template <>
405 inline __m128i UnpackXMM<2>(const __m128i& v)
406 {
407  // Splat to all lanes
408  return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
409 }
410 
411 template <>
412 inline __m128i UnpackXMM<3>(const __m128i& v)
413 {
414  // Splat to all lanes
415  return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
416 }
417 
418 template <unsigned int IDX>
419 inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
420 {
421  return UnpackXMM<IDX>(a, b, c, d);
422 }
423 
424 template <unsigned int IDX>
425 inline __m128i RepackXMM(const __m128i& v)
426 {
427  return UnpackXMM<IDX>(v);
428 }
429 
430 #endif // CRYPTOPP_SSSE3_AVAILABLE
431 
432 // *************************** Power8 ***************************//
433 
434 #if (CRYPTOPP_POWER8_AVAILABLE)
435 
439 
440 inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b)
441 {
442  return VecXor(a, b);
443 }
444 
445 inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b)
446 {
447  return VecAdd(a, b);
448 }
449 
450 inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b)
451 {
452  return VecSub(a, b);
453 }
454 
455 template <unsigned int R>
456 inline uint32x4_p RotateLeft(const uint32x4_p& val)
457 {
458  const uint32x4_p m = {R, R, R, R};
459  return vec_rl(val, m);
460 }
461 
462 template <unsigned int R>
463 inline uint32x4_p RotateRight(const uint32x4_p& val)
464 {
465  const uint32x4_p m = {32-R, 32-R, 32-R, 32-R};
466  return vec_rl(val, m);
467 }
468 
469 template <unsigned int IDX>
470 inline uint32x4_p LoadKey(const word32 rkey[])
471 {
472  return vec_splats(rkey[IDX]);
473 }
474 
475 template <unsigned int IDX>
476 inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
477 {
478  // Should not be instantiated
479  CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
480  CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
481  CRYPTOPP_ASSERT(0);
482  return VecXor(a, a);
483 }
484 
485 template <>
486 inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
487 {
488  const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
489  const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
490  return (uint32x4_p)vec_mergel(r1, r2);
491 }
492 
493 template <>
494 inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
495 {
496  const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
497  const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
498  return (uint32x4_p)vec_mergeh(r1, r2);
499 }
500 
501 template <>
502 inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
503 {
504  const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
505  const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
506  return (uint32x4_p)vec_mergel(r1, r2);
507 }
508 
509 template <>
510 inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
511 {
512  const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
513  const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
514  return (uint32x4_p)vec_mergeh(r1, r2);
515 }
516 
517 template <unsigned int IDX>
518 inline uint32x4_p UnpackSIMD(const uint32x4_p& v)
519 {
520  // Should not be instantiated
521  CRYPTOPP_ASSERT(0);
522  return VecXor(v, v);
523 }
524 
525 template <>
526 inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v)
527 {
528  // Splat to all lanes
529  const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0};
530  return (uint32x4_p)VecPermute(v, v, m);
531 }
532 
533 template <>
534 inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v)
535 {
536  // Splat to all lanes
537  const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4};
538  return (uint32x4_p)VecPermute(v, v, m);
539 }
540 
541 template <>
542 inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v)
543 {
544  // Splat to all lanes
545  const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8};
546  return (uint32x4_p)VecPermute(v, v, m);
547 }
548 
549 template <>
550 inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v)
551 {
552  // Splat to all lanes
553  const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12};
554  return (uint32x4_p)VecPermute(v, v, m);
555 }
556 
557 template <unsigned int IDX>
558 inline uint32x4_p RepackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
559 {
560  return UnpackSIMD<IDX>(a, b, c, d);
561 }
562 
563 template <unsigned int IDX>
564 inline uint32x4_p RepackSIMD(const uint32x4_p& v)
565 {
566  return UnpackSIMD<IDX>(v);
567 }
568 
569 #endif // CRYPTOPP_POWER8_AVAILABLE
570 
571 // *************************** LEA Encryption ***************************//
572 
573 #if (CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_SSSE3_AVAILABLE)
574 
575 template <class W>
576 inline void LEA_Encryption(W temp[4], const word32 *subkeys, unsigned int rounds)
577 {
578  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
579  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
580  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
581  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
582  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
583  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
584  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
585  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
586  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
587  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
588  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
589  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
590 
591  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
592  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
593  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
594  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
595  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
596  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
597  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
598  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
599  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
600  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
601  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
602  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
603 
604  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
605  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
606  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
607  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
608  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
609  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
610  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
611  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
612  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
613  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
614  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
615  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
616 
617  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
618  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
619  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
620  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
621  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
622  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
623  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
624  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
625  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
626  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
627  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
628  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
629 
630  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
631  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
632  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
633  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
634  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
635  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
636  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
637  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
638  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
639  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
640  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
641  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
642 
643  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
644  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
645  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
646  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
647  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
648  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
649  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
650  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
651  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
652  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
653  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
654  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
655 
656  if(rounds > 24)
657  {
658  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
659  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
660  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
661  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
662  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
663  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
664  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
665  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
666  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
667  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
668  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
669  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
670  }
671 
672  if(rounds > 28)
673  {
674  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
675  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
676  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
677  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
678  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
679  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
680  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
681  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
682  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
683  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
684  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
685  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
686  }
687 }
688 
689 // *************************** LEA Decryption ***************************//
690 
691 template <class W>
692 inline void LEA_Decryption(W temp[4], const word32 *subkeys, unsigned int rounds)
693 {
694  if(rounds > 28)
695  {
696  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
697  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
698  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys));
699  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys));
700  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys));
701  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys));
702  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys));
703  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys));
704  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys));
705  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys));
706  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys));
707  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys));
708  }
709 
710  if(rounds > 24)
711  {
712  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys));
713  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys));
714  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys));
715  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys));
716  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys));
717  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys));
718  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys));
719  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys));
720  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys));
721  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys));
722  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys));
723  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys));
724  }
725 
726  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys));
727  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys));
728  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys));
729  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys));
730  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys));
731  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys));
732  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys));
733  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys));
734  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys));
735  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys));
736  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys));
737  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys));
738 
739  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys));
740  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys));
741  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys));
742  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys));
743  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys));
744  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys));
745  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys));
746  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys));
747  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys));
748  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys));
749  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys));
750  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys));
751 
752  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys));
753  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys));
754  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys));
755  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys));
756  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys));
757  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys));
758  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys));
759  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys));
760  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys));
761  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys));
762  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys));
763  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys));
764 
765  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys));
766  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys));
767  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys));
768  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys));
769  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys));
770  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys));
771  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys));
772  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys));
773  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys));
774  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys));
775  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys));
776  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys));
777 
778  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys));
779  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys));
780  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys));
781  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys));
782  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys));
783  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys));
784  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys));
785  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys));
786  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys));
787  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys));
788  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys));
789  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys));
790 
791  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys));
792  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys));
793  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys));
794  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys));
795  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys));
796  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys));
797  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys));
798  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys));
799  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys));
800  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys));
801  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys));
802  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys));
803 }
804 
805 #endif // LEA Encryption and Decryption
806 
807 // *************************** ARM NEON ***************************//
808 
809 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
810 
811 inline void LEA_Enc_Block(uint32x4_t &block0,
812  const word32 *subkeys, unsigned int rounds)
813 {
814  uint32x4_t temp[4];
815  temp[0] = UnpackNEON<0>(block0);
816  temp[1] = UnpackNEON<1>(block0);
817  temp[2] = UnpackNEON<2>(block0);
818  temp[3] = UnpackNEON<3>(block0);
819 
820  LEA_Encryption(temp, subkeys, rounds);
821 
822  block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
823 }
824 
825 inline void LEA_Dec_Block(uint32x4_t &block0,
826  const word32 *subkeys, unsigned int rounds)
827 {
828  uint32x4_t temp[4];
829  temp[0] = UnpackNEON<0>(block0);
830  temp[1] = UnpackNEON<1>(block0);
831  temp[2] = UnpackNEON<2>(block0);
832  temp[3] = UnpackNEON<3>(block0);
833 
834  LEA_Decryption(temp, subkeys, rounds);
835 
836  block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
837 }
838 
839 inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
840  uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
841 {
842  uint32x4_t temp[4];
843  temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
844  temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
845  temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
846  temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
847 
848  LEA_Encryption(temp, subkeys, rounds);
849 
850  block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
851  block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
852  block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
853  block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
854 }
855 
856 inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
857  uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
858 {
859  uint32x4_t temp[4];
860  temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
861  temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
862  temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
863  temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
864 
865  LEA_Decryption(temp, subkeys, rounds);
866 
867  block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
868  block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
869  block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
870  block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
871 }
872 
873 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
874 
875 // *************************** IA-32 ***************************//
876 
877 #if (CRYPTOPP_SSSE3_AVAILABLE)
878 
879 inline void LEA_Enc_Block(__m128i &block0,
880  const word32 *subkeys, unsigned int rounds)
881 {
882  __m128i temp[4];
883  temp[0] = UnpackXMM<0>(block0);
884  temp[1] = UnpackXMM<1>(block0);
885  temp[2] = UnpackXMM<2>(block0);
886  temp[3] = UnpackXMM<3>(block0);
887 
888  LEA_Encryption(temp, subkeys, rounds);
889 
890  block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
891 }
892 
893 inline void LEA_Dec_Block(__m128i &block0,
894  const word32 *subkeys, unsigned int rounds)
895 {
896  __m128i temp[4];
897  temp[0] = UnpackXMM<0>(block0);
898  temp[1] = UnpackXMM<1>(block0);
899  temp[2] = UnpackXMM<2>(block0);
900  temp[3] = UnpackXMM<3>(block0);
901 
902  LEA_Decryption(temp, subkeys, rounds);
903 
904  block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
905 }
906 
907 inline void LEA_Enc_4_Blocks(__m128i &block0, __m128i &block1,
908  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
909 {
910  __m128i temp[4];
911  temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
912  temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
913  temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
914  temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
915 
916  LEA_Encryption(temp, subkeys, rounds);
917 
918  block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
919  block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
920  block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
921  block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
922 }
923 
924 inline void LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1,
925  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
926 {
927  __m128i temp[4];
928  temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
929  temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
930  temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
931  temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
932 
933  LEA_Decryption(temp, subkeys, rounds);
934 
935  block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
936  block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
937  block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
938  block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
939 }
940 
941 #endif // CRYPTOPP_SSSE3_AVAILABLE
942 
943 // *************************** Power8 ***************************//
944 
945 #if (CRYPTOPP_POWER8_AVAILABLE)
946 
947 inline void LEA_Enc_Block(uint32x4_p &block0,
948  const word32 *subkeys, unsigned int rounds)
949 {
950  uint32x4_p temp[4];
951  temp[0] = UnpackSIMD<0>(block0);
952  temp[1] = UnpackSIMD<1>(block0);
953  temp[2] = UnpackSIMD<2>(block0);
954  temp[3] = UnpackSIMD<3>(block0);
955 
956  LEA_Encryption(temp, subkeys, rounds);
957 
958  block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
959 }
960 
961 inline void LEA_Dec_Block(uint32x4_p &block0,
962  const word32 *subkeys, unsigned int rounds)
963 {
964  uint32x4_p temp[4];
965  temp[0] = UnpackSIMD<0>(block0);
966  temp[1] = UnpackSIMD<1>(block0);
967  temp[2] = UnpackSIMD<2>(block0);
968  temp[3] = UnpackSIMD<3>(block0);
969 
970  LEA_Decryption(temp, subkeys, rounds);
971 
972  block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
973 }
974 
975 inline void LEA_Enc_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
976  uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
977 {
978  uint32x4_p temp[4];
979  temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
980  temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
981  temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
982  temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
983 
984  LEA_Encryption(temp, subkeys, rounds);
985 
986  block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
987  block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
988  block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
989  block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
990 }
991 
992 inline void LEA_Dec_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
993  uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
994 {
995  uint32x4_p temp[4];
996  temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
997  temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
998  temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
999  temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
1000 
1001  LEA_Decryption(temp, subkeys, rounds);
1002 
1003  block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
1004  block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
1005  block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
1006  block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
1007 }
1008 
1009 #endif // CRYPTOPP_POWER8_AVAILABLE
1010 
1011 ANONYMOUS_NAMESPACE_END
1012 
1013 // *************************** SIMD Templates ***************************//
1014 
1015 NAMESPACE_BEGIN(CryptoPP)
1016 
1017 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
1018 size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1019  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1020 {
1021  return AdvancedProcessBlocks128_4x1_SSE(LEA_Enc_Block, LEA_Enc_4_Blocks,
1022  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1023 }
1024 
1025 size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1026  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1027 {
1028  return AdvancedProcessBlocks128_4x1_SSE(LEA_Dec_Block, LEA_Dec_4_Blocks,
1029  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1030 }
1031 #endif // CRYPTOPP_SSSE3_AVAILABLE
1032 
1033 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
1034 size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1035  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1036 {
1037  uint32x4_t unused; // Avoid template argument deduction/substitution failures
1038  return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks,
1039  unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1040 }
1041 
1042 size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1043  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1044 {
1045  uint32x4_t unused; // Avoid template argument deduction/substitution failures
1046  return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks,
1047  unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1048 }
1049 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
1050 
1051 #if defined(CRYPTOPP_POWER8_AVAILABLE)
1052 size_t LEA_Enc_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1053  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1054 {
1055  return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Enc_Block, LEA_Enc_4_Blocks,
1056  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1057 }
1058 
1059 size_t LEA_Dec_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1060  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1061 {
1062  return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Dec_Block, LEA_Dec_4_Blocks,
1063  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1064 }
1065 #endif // CRYPTOPP_POWER8_AVAILABLE
1066 
1067 NAMESPACE_END
Utility functions for the Crypto++ library.
Classes for the LEA block cipher.
T1 VecSub(const T1 vec1, const T2 vec2)
Subtract two vectors.
Definition: ppc_simd.h:980
Library configuration file.
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:963
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:875
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:128
Support functions for PowerPC and vector operations.
Template for AdvancedProcessBlocks and SIMD processing.
Precompiled header file.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:60
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:945
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition: ppc_simd.h:138
Crypto++ library namespace.
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:118