#if defined(CRYPTOPP_SSE2_AVAILABLE)
# define CRYPTOPP_AVX512_ROTATE 1
# include <xmmintrin.h>
# include <emmintrin.h>
#endif

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char CHAM_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word16;
using CryptoPP::word32;
#if (CRYPTOPP_SSSE3_AVAILABLE)

NAMESPACE_BEGIN(W16)  // CHAM64, 16-bit word size

template <unsigned int R>
inline __m128i RotateLeft16(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi16(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight16(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi16(val, 16-R);
#else
    return _mm_or_si128(
        _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
#endif
}
template <>
inline __m128i RotateLeft16<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi16(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

template <>
inline __m128i RotateRight16<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi16(val, 16-8);
#else
    const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
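// Rotating a 16-bit lane by 8 is a byte swap within the lane, so the non-XOP
// paths above use a single byte shuffle instead of two shifts and an OR. The
// same shuffle mask serves both rotation directions.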
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f);
    CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpacklo_epi16(a, b);
    const __m128i r2 = _mm_unpacklo_epi16(c, d);
    const __m128i r3 = _mm_unpacklo_epi16(e, f);
    const __m128i r4 = _mm_unpacklo_epi16(g, h);

    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpacklo_epi16(a, b);
    const __m128i r2 = _mm_unpacklo_epi16(c, d);
    const __m128i r3 = _mm_unpacklo_epi16(e, f);
    const __m128i r4 = _mm_unpacklo_epi16(g, h);

    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpacklo_epi16(a, b);
    const __m128i r2 = _mm_unpacklo_epi16(c, d);
    const __m128i r3 = _mm_unpacklo_epi16(e, f);
    const __m128i r4 = _mm_unpacklo_epi16(g, h);

    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpacklo_epi16(a, b);
    const __m128i r2 = _mm_unpacklo_epi16(c, d);
    const __m128i r3 = _mm_unpacklo_epi16(e, f);
    const __m128i r4 = _mm_unpacklo_epi16(g, h);

    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpackhi_epi16(a, b);
    const __m128i r2 = _mm_unpackhi_epi16(c, d);
    const __m128i r3 = _mm_unpackhi_epi16(e, f);
    const __m128i r4 = _mm_unpackhi_epi16(g, h);

    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpackhi_epi16(a, b);
    const __m128i r2 = _mm_unpackhi_epi16(c, d);
    const __m128i r3 = _mm_unpackhi_epi16(e, f);
    const __m128i r4 = _mm_unpackhi_epi16(g, h);

    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpackhi_epi16(a, b);
    const __m128i r2 = _mm_unpackhi_epi16(c, d);
    const __m128i r3 = _mm_unpackhi_epi16(e, f);
    const __m128i r4 = _mm_unpackhi_epi16(g, h);

    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpackhi_epi16(a, b);
    const __m128i r2 = _mm_unpackhi_epi16(c, d);
    const __m128i r3 = _mm_unpackhi_epi16(e, f);
    const __m128i r4 = _mm_unpackhi_epi16(g, h);

    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
}

template <>
inline __m128i UnpackXMM<4>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
}

template <>
inline __m128i UnpackXMM<5>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
}

template <>
inline __m128i UnpackXMM<6>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
}

template <>
inline __m128i UnpackXMM<7>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
}
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
{
    const __m128i z = _mm_setzero_si128();
    return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
}
template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d, const __m128i& e, const __m128i& f,
    const __m128i& g, const __m128i& h)
{
    return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}
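// Each __m128i can hold two 64-bit CHAM-64 blocks. Unpacking word-slices the
// data so that a..d hold the four 16-bit words of the low block and e..h
// those of the high block; the rounds then process both blocks in parallel
// with the same round keys and round counter.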
inline void CHAM64_Enc_Block(__m128i &block0,
    const word16 *subkeys, unsigned int /*rounds*/)
{
    // Rearrange the data for vectorization
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);
    __m128i e = UnpackXMM<4>(block0);
    __m128i f = UnpackXMM<5>(block0);
    __m128i g = UnpackXMM<6>(block0);
    __m128i h = UnpackXMM<7>(block0);

    const unsigned int rounds = 80;
    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);

    const unsigned int MASK = 15;
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, kr, t1, t2, t3, t4;
        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i+0) & MASK])));

        // Broadcast the round key
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));

        t1 = _mm_xor_si128(a, counter);
        t3 = _mm_xor_si128(e, counter);
        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));

        t1 = _mm_xor_si128(b, counter);
        t3 = _mm_xor_si128(f, counter);
        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));

        t1 = _mm_xor_si128(c, counter);
        t3 = _mm_xor_si128(g, counter);
        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));

        t1 = _mm_xor_si128(d, counter);
        t3 = _mm_xor_si128(h, counter);
        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
    }

    // Repack the data
    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
}
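// Decryption walks the same four-round pattern in reverse: the round counter
// starts at rounds-1 and counts down, and each step inverts the
// rotate-add-xor of the matching encryption step with a subtraction.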
inline void CHAM64_Dec_Block(__m128i &block0,
    const word16 *subkeys, unsigned int /*rounds*/)
{
    // Rearrange the data for vectorization
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);
    __m128i e = UnpackXMM<4>(block0);
    __m128i f = UnpackXMM<5>(block0);
    __m128i g = UnpackXMM<6>(block0);
    __m128i h = UnpackXMM<7>(block0);

    const unsigned int rounds = 80;
    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);

    const unsigned int MASK = 15;
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, kr, t1, t2, t3, t4;
        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i-3) & MASK])));

        // Broadcast the round key
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));

        t1 = RotateRight16<1>(d);
        t3 = RotateRight16<1>(h);
        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));

        t1 = RotateRight16<8>(c);
        t3 = RotateRight16<8>(g);
        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));

        t1 = RotateRight16<1>(b);
        t3 = RotateRight16<1>(f);
        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));

        t1 = RotateRight16<8>(a);
        t3 = RotateRight16<8>(e);
        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
    }

    // Repack the data
    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
}
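// The two-register variants below gather the word slices from block0 and
// block1 together, so each pass processes four 64-bit blocks at once.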
inline void CHAM64_Enc_2_Blocks(__m128i &block0, __m128i &block1,
    const word16 *subkeys, unsigned int /*rounds*/)
{
    // Rearrange the data for vectorization
    __m128i a = UnpackXMM<0>(block0, block1);
    __m128i b = UnpackXMM<1>(block0, block1);
    __m128i c = UnpackXMM<2>(block0, block1);
    __m128i d = UnpackXMM<3>(block0, block1);
    __m128i e = UnpackXMM<4>(block0, block1);
    __m128i f = UnpackXMM<5>(block0, block1);
    __m128i g = UnpackXMM<6>(block0, block1);
    __m128i h = UnpackXMM<7>(block0, block1);

    const unsigned int rounds = 80;
    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);

    const unsigned int MASK = 15;
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, kr, t1, t2, t3, t4;
        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[i & MASK])));

        // Broadcast the round key
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));

        t1 = _mm_xor_si128(a, counter);
        t3 = _mm_xor_si128(e, counter);
        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));

        t1 = _mm_xor_si128(b, counter);
        t3 = _mm_xor_si128(f, counter);
        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));

        t1 = _mm_xor_si128(c, counter);
        t3 = _mm_xor_si128(g, counter);
        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));

        t1 = _mm_xor_si128(d, counter);
        t3 = _mm_xor_si128(h, counter);
        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
    }

    // Repack the data
    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
}
inline void CHAM64_Dec_2_Blocks(__m128i &block0, __m128i &block1,
    const word16 *subkeys, unsigned int /*rounds*/)
{
    // Rearrange the data for vectorization
    __m128i a = UnpackXMM<0>(block0, block1);
    __m128i b = UnpackXMM<1>(block0, block1);
    __m128i c = UnpackXMM<2>(block0, block1);
    __m128i d = UnpackXMM<3>(block0, block1);
    __m128i e = UnpackXMM<4>(block0, block1);
    __m128i f = UnpackXMM<5>(block0, block1);
    __m128i g = UnpackXMM<6>(block0, block1);
    __m128i h = UnpackXMM<7>(block0, block1);

    const unsigned int rounds = 80;
    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);

    const unsigned int MASK = 15;
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, kr, t1, t2, t3, t4;
        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i-3) & MASK])));

        // Broadcast the round key
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));

        t1 = RotateRight16<1>(d);
        t3 = RotateRight16<1>(h);
        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));

        t1 = RotateRight16<8>(c);
        t3 = RotateRight16<8>(g);
        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));

        t1 = RotateRight16<1>(b);
        t3 = RotateRight16<1>(f);
        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));

        t1 = RotateRight16<8>(a);
        t3 = RotateRight16<8>(e);
        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
    }

    // Repack the data
    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
}
NAMESPACE_END  // W16

NAMESPACE_BEGIN(W32)  // CHAM128, 32-bit word size

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_rol_epi32(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_ror_epi32(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}
// Rotations by 8 are done with a single byte shuffle on the non-XOP path.
template <>
inline __m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

template <>
inline __m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
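// CHAM-128 works on 32-bit words. The unpack routines below perform a 4x4
// word transpose: UnpackXMM<IDX> gathers word IDX of up to four blocks into
// one register (byte-swapping each word), and RepackXMM applies the same
// shuffle to restore the original layout.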
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
}
template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b,
    const __m128i& c, const __m128i& d)
{
    return UnpackXMM<IDX>(a, b, c, d);
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}
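// The key index mask depends on the key size: CHAM-128/128 (80 rounds) uses
// 8 round keys while CHAM-128/256 (96 rounds) uses 16, hence
// MASK = (rounds == 80 ? 7 : 15) in the kernels below.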
inline void CHAM128_Enc_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i+0) & MASK])));

        // Broadcast the two round keys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i+2) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // Repack the data
    block0 = RepackXMM<0>(a,b,c,d);
}
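// As with CHAM-64, decryption runs the schedule backwards: the counter starts
// at rounds-1 and each step undoes the addition with a subtraction before
// XOR'ing out the round counter.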
inline void CHAM128_Dec_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i-1) & MASK])));

        // Broadcast the two round keys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i-3) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // Repack the data
    block0 = RepackXMM<0>(a,b,c,d);
}
inline void CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i+0) & MASK])));

        // Broadcast the two round keys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i+2) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // Repack the data
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}
inline void CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i-1) & MASK])));

        // Broadcast the two round keys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        k = _mm_castpd_si128(_mm_load_sd(
            (const double*)(&subkeys[(i-3) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // Repack the data
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}
NAMESPACE_END  // W32

#endif  // CRYPTOPP_SSSE3_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
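// The entry points below adapt the block kernels to the library's
// AdvancedProcessBlocks interface; the adv_simd.h templates handle loading,
// XOR'ing and storing the blocks and dispatch one or several blocks per call.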
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Enc_Block, W32::CHAM128_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Dec_Block, W32::CHAM128_Dec_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif  // CRYPTOPP_SSSE3_AVAILABLE

NAMESPACE_END