#include "config.h"
#include "chacha.h"
#include "misc.h"

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
#endif

#if defined(__SSSE3__)
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char CHACHA_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN
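// Each ChaCha_OperateKeystream_* function below computes four 64-byte ChaCha
// blocks per call. Four copies of the 4x4 word state are kept in vector
// registers (r0_* .. r3_*), with the 64-bit block counter of copies 1..3
// offset by 1..3. Every loop iteration performs one column round and one
// diagonal round, which is why the round counter decrements by two.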
#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <unsigned int R>
inline uint32x4_t RotateLeft(const uint32x4_t& val)
{
    return vorrq_u32(vshlq_n_u32(val, R), vshrq_n_u32(val, 32 - R));
}
template <unsigned int R>
inline uint32x4_t RotateRight(const uint32x4_t& val)
{
    return vorrq_u32(vshlq_n_u32(val, 32 - R), vshrq_n_u32(val, R));
}
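// The specializations below replace the generic shift-and-or rotation with
// cheaper instructions where AArch32/AArch64 provides them: a byte-table
// lookup (vqtbl1q_u8) for rotate-by-8 and a halfword reversal (vrev32q_u16)
// for rotate-by-16. Other targets fall back to the shift-and-or form.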
template <>
inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
{
#if defined(__aarch32__) || defined(__aarch64__)
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
#else
    return vorrq_u32(vshlq_n_u32(val, 8),
        vshrq_n_u32(val, 32 - 8));
#endif
}
template <>
inline uint32x4_t RotateLeft<16>(const uint32x4_t& val)
{
#if defined(__aarch32__) || defined(__aarch64__)
    return vreinterpretq_u32_u16(
        vrev32q_u16(vreinterpretq_u16_u32(val)));
#else
    return vorrq_u32(vshlq_n_u32(val, 16),
        vshrq_n_u32(val, 32 - 16));
#endif
}
template <>
inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
{
#if defined(__aarch32__) || defined(__aarch64__)
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
#else
    return vorrq_u32(vshrq_n_u32(val, 8),
        vshlq_n_u32(val, 32 - 8));
#endif
}
template <>
inline uint32x4_t RotateRight<16>(const uint32x4_t& val)
{
#if defined(__aarch32__) || defined(__aarch64__)
    return vreinterpretq_u32_u16(
        vrev32q_u16(vreinterpretq_u16_u32(val)));
#else
    return vorrq_u32(vshrq_n_u32(val, 16),
        vshlq_n_u32(val, 32 - 16));
#endif
}
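// Extract<S> rotates the four 32-bit lanes of a vector by S positions. It is
// the NEON counterpart of the x86 _mm_shuffle_epi32 calls used later: the
// rounds loop uses it to move rows 1..3 between the column-round and
// diagonal-round arrangements of the ChaCha state.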
template <unsigned int S>
inline uint32x4_t Extract(const uint32x4_t& val)
{
    return vextq_u32(val, val, S);
}
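// Add64 reinterprets the vector as two 64-bit lanes and adds them, so a carry
// out of the low 32-bit counter word propagates into the high word. It is
// used to bump the 64-bit block counter held in the last row of the state.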
inline uint32x4_t Add64(const uint32x4_t& a, const uint32x4_t& b)
{
    return vreinterpretq_u32_u64(
        vaddq_u64(
            vreinterpretq_u64_u32(a),
            vreinterpretq_u64_u32(b)));
}
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)

template <unsigned int R>
inline __m128i RotateLeft(const __m128i val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}
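// On SSSE3 targets the rotate-by-8 and rotate-by-16 specializations use a
// single byte shuffle (_mm_shuffle_epi8) instead of two shifts and an OR;
// with XOP the dedicated rotate (_mm_roti_epi32) is used instead.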
template <>
inline __m128i RotateLeft<8>(const __m128i val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#elif defined(__SSSE3__)
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#else
    return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 32-8));
#endif
}
template <>
inline __m128i RotateLeft<16>(const __m128i val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 16);
#elif defined(__SSSE3__)
    const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
    return _mm_shuffle_epi8(val, mask);
#else
    return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 32-16));
#endif
}
#endif  // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE

#if (CRYPTOPP_ALTIVEC_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::VecLoad;
using CryptoPP::VecStore;
using CryptoPP::VecPermute;

// Loads 16 bytes as four little-endian 32-bit words. The state is kept in
// native order, so only input and output need a byte permutation, and only
// on big-endian targets.
inline uint32x4_p VecLoad32LE(const uint8_t src[16])
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
    const uint32x4_p val = VecLoad(src);
    return VecPermute(val, val, mask);
#else
    return VecLoad(src);
#endif
}
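// VecStore32LE is the store-side mirror of VecLoad32LE: on big-endian targets
// it applies the same byte permutation before writing the 16-byte block.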
inline void VecStore32LE(uint8_t dest[16], const uint32x4_p& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
    VecStore(VecPermute(val, val, mask), dest);
#else
    VecStore(val, dest);
#endif
}
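// Shuffle<S> is the Altivec counterpart of Extract<S> above: it rotates the
// four 32-bit lanes by S positions via a byte permutation, moving rows 1..3
// between the column-round and diagonal-round layouts. The unspecialized
// template is not meant to be instantiated.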
template <unsigned int S>
inline uint32x4_p Shuffle(const uint32x4_p& val)
{
    CRYPTOPP_ASSERT(0);
    return val;
}

template <>
inline uint32x4_p Shuffle<1>(const uint32x4_p& val)
{
    const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
    return VecPermute(val, val, mask);
}

template <>
inline uint32x4_p Shuffle<2>(const uint32x4_p& val)
{
    const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
    return VecPermute(val, val, mask);
}

template <>
inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
{
    const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
    return VecPermute(val, val, mask);
}
#endif  // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
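// ChaCha_OperateKeystream_NEON writes four 64-byte ChaCha blocks (256 bytes)
// at consecutive counter values to output. When input is non-NULL the
// keystream is XORed with it first; otherwise the raw keystream is written.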
#if (CRYPTOPP_ARM_NEON_AVAILABLE)

void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    const uint32x4_t state0 = vld1q_u32(state + 0*4);
    const uint32x4_t state1 = vld1q_u32(state + 1*4);
    const uint32x4_t state2 = vld1q_u32(state + 2*4);
    const uint32x4_t state3 = vld1q_u32(state + 3*4);

    const uint32x4_t CTRS[3] = {
        {1,0,0,0}, {2,0,0,0}, {3,0,0,0}
    };
    uint32x4_t r0_0 = state0;
    uint32x4_t r0_1 = state1;
    uint32x4_t r0_2 = state2;
    uint32x4_t r0_3 = state3;

    uint32x4_t r1_0 = state0;
    uint32x4_t r1_1 = state1;
    uint32x4_t r1_2 = state2;
    uint32x4_t r1_3 = Add64(r0_3, CTRS[0]);

    uint32x4_t r2_0 = state0;
    uint32x4_t r2_1 = state1;
    uint32x4_t r2_2 = state2;
    uint32x4_t r2_3 = Add64(r0_3, CTRS[1]);

    uint32x4_t r3_0 = state0;
    uint32x4_t r3_1 = state1;
    uint32x4_t r3_2 = state2;
    uint32x4_t r3_3 = Add64(r0_3, CTRS[2]);
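    // Each pass of the loop below is a ChaCha "double round": a column round
    // over (r*_0, r*_1, r*_2, r*_3), an Extract-based lane rotation that lines
    // the state up on its diagonals, a diagonal round, and the inverse lane
    // rotation. All four block copies are interleaved to hide latency.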
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        r0_0 = vaddq_u32(r0_0, r0_1);
        r1_0 = vaddq_u32(r1_0, r1_1);
        r2_0 = vaddq_u32(r2_0, r2_1);
        r3_0 = vaddq_u32(r3_0, r3_1);

        r0_3 = veorq_u32(r0_3, r0_0);
        r1_3 = veorq_u32(r1_3, r1_0);
        r2_3 = veorq_u32(r2_3, r2_0);
        r3_3 = veorq_u32(r3_3, r3_0);

        r0_3 = RotateLeft<16>(r0_3);
        r1_3 = RotateLeft<16>(r1_3);
        r2_3 = RotateLeft<16>(r2_3);
        r3_3 = RotateLeft<16>(r3_3);

        r0_2 = vaddq_u32(r0_2, r0_3);
        r1_2 = vaddq_u32(r1_2, r1_3);
        r2_2 = vaddq_u32(r2_2, r2_3);
        r3_2 = vaddq_u32(r3_2, r3_3);

        r0_1 = veorq_u32(r0_1, r0_2);
        r1_1 = veorq_u32(r1_1, r1_2);
        r2_1 = veorq_u32(r2_1, r2_2);
        r3_1 = veorq_u32(r3_1, r3_2);

        r0_1 = RotateLeft<12>(r0_1);
        r1_1 = RotateLeft<12>(r1_1);
        r2_1 = RotateLeft<12>(r2_1);
        r3_1 = RotateLeft<12>(r3_1);

        r0_0 = vaddq_u32(r0_0, r0_1);
        r1_0 = vaddq_u32(r1_0, r1_1);
        r2_0 = vaddq_u32(r2_0, r2_1);
        r3_0 = vaddq_u32(r3_0, r3_1);

        r0_3 = veorq_u32(r0_3, r0_0);
        r1_3 = veorq_u32(r1_3, r1_0);
        r2_3 = veorq_u32(r2_3, r2_0);
        r3_3 = veorq_u32(r3_3, r3_0);

        r0_3 = RotateLeft<8>(r0_3);
        r1_3 = RotateLeft<8>(r1_3);
        r2_3 = RotateLeft<8>(r2_3);
        r3_3 = RotateLeft<8>(r3_3);

        r0_2 = vaddq_u32(r0_2, r0_3);
        r1_2 = vaddq_u32(r1_2, r1_3);
        r2_2 = vaddq_u32(r2_2, r2_3);
        r3_2 = vaddq_u32(r3_2, r3_3);

        r0_1 = veorq_u32(r0_1, r0_2);
        r1_1 = veorq_u32(r1_1, r1_2);
        r2_1 = veorq_u32(r2_1, r2_2);
        r3_1 = veorq_u32(r3_1, r3_2);

        r0_1 = RotateLeft<7>(r0_1);
        r1_1 = RotateLeft<7>(r1_1);
        r2_1 = RotateLeft<7>(r2_1);
        r3_1 = RotateLeft<7>(r3_1);
        r0_1 = Extract<1>(r0_1);
        r0_2 = Extract<2>(r0_2);
        r0_3 = Extract<3>(r0_3);

        r1_1 = Extract<1>(r1_1);
        r1_2 = Extract<2>(r1_2);
        r1_3 = Extract<3>(r1_3);

        r2_1 = Extract<1>(r2_1);
        r2_2 = Extract<2>(r2_2);
        r2_3 = Extract<3>(r2_3);

        r3_1 = Extract<1>(r3_1);
        r3_2 = Extract<2>(r3_2);
        r3_3 = Extract<3>(r3_3);
        r0_0 = vaddq_u32(r0_0, r0_1);
        r1_0 = vaddq_u32(r1_0, r1_1);
        r2_0 = vaddq_u32(r2_0, r2_1);
        r3_0 = vaddq_u32(r3_0, r3_1);

        r0_3 = veorq_u32(r0_3, r0_0);
        r1_3 = veorq_u32(r1_3, r1_0);
        r2_3 = veorq_u32(r2_3, r2_0);
        r3_3 = veorq_u32(r3_3, r3_0);

        r0_3 = RotateLeft<16>(r0_3);
        r1_3 = RotateLeft<16>(r1_3);
        r2_3 = RotateLeft<16>(r2_3);
        r3_3 = RotateLeft<16>(r3_3);

        r0_2 = vaddq_u32(r0_2, r0_3);
        r1_2 = vaddq_u32(r1_2, r1_3);
        r2_2 = vaddq_u32(r2_2, r2_3);
        r3_2 = vaddq_u32(r3_2, r3_3);

        r0_1 = veorq_u32(r0_1, r0_2);
        r1_1 = veorq_u32(r1_1, r1_2);
        r2_1 = veorq_u32(r2_1, r2_2);
        r3_1 = veorq_u32(r3_1, r3_2);

        r0_1 = RotateLeft<12>(r0_1);
        r1_1 = RotateLeft<12>(r1_1);
        r2_1 = RotateLeft<12>(r2_1);
        r3_1 = RotateLeft<12>(r3_1);

        r0_0 = vaddq_u32(r0_0, r0_1);
        r1_0 = vaddq_u32(r1_0, r1_1);
        r2_0 = vaddq_u32(r2_0, r2_1);
        r3_0 = vaddq_u32(r3_0, r3_1);

        r0_3 = veorq_u32(r0_3, r0_0);
        r1_3 = veorq_u32(r1_3, r1_0);
        r2_3 = veorq_u32(r2_3, r2_0);
        r3_3 = veorq_u32(r3_3, r3_0);

        r0_3 = RotateLeft<8>(r0_3);
        r1_3 = RotateLeft<8>(r1_3);
        r2_3 = RotateLeft<8>(r2_3);
        r3_3 = RotateLeft<8>(r3_3);

        r0_2 = vaddq_u32(r0_2, r0_3);
        r1_2 = vaddq_u32(r1_2, r1_3);
        r2_2 = vaddq_u32(r2_2, r2_3);
        r3_2 = vaddq_u32(r3_2, r3_3);

        r0_1 = veorq_u32(r0_1, r0_2);
        r1_1 = veorq_u32(r1_1, r1_2);
        r2_1 = veorq_u32(r2_1, r2_2);
        r3_1 = veorq_u32(r3_1, r3_2);

        r0_1 = RotateLeft<7>(r0_1);
        r1_1 = RotateLeft<7>(r1_1);
        r2_1 = RotateLeft<7>(r2_1);
        r3_1 = RotateLeft<7>(r3_1);
        r0_1 = Extract<3>(r0_1);
        r0_2 = Extract<2>(r0_2);
        r0_3 = Extract<1>(r0_3);

        r1_1 = Extract<3>(r1_1);
        r1_2 = Extract<2>(r1_2);
        r1_3 = Extract<1>(r1_3);

        r2_1 = Extract<3>(r2_1);
        r2_2 = Extract<2>(r2_2);
        r2_3 = Extract<1>(r2_3);

        r3_1 = Extract<3>(r3_1);
        r3_2 = Extract<2>(r3_2);
        r3_3 = Extract<1>(r3_3);
    }
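    // Feed-forward: add the original input state back into each working copy,
    // giving copies 1..3 the same 64-bit counter offsets that were applied
    // before the rounds.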
    r0_0 = vaddq_u32(r0_0, state0);
    r0_1 = vaddq_u32(r0_1, state1);
    r0_2 = vaddq_u32(r0_2, state2);
    r0_3 = vaddq_u32(r0_3, state3);

    r1_0 = vaddq_u32(r1_0, state0);
    r1_1 = vaddq_u32(r1_1, state1);
    r1_2 = vaddq_u32(r1_2, state2);
    r1_3 = vaddq_u32(r1_3, state3);
    r1_3 = Add64(r1_3, CTRS[0]);

    r2_0 = vaddq_u32(r2_0, state0);
    r2_1 = vaddq_u32(r2_1, state1);
    r2_2 = vaddq_u32(r2_2, state2);
    r2_3 = vaddq_u32(r2_3, state3);
    r2_3 = Add64(r2_3, CTRS[1]);

    r3_0 = vaddq_u32(r3_0, state0);
    r3_1 = vaddq_u32(r3_1, state1);
    r3_2 = vaddq_u32(r3_2, state2);
    r3_3 = vaddq_u32(r3_3, state3);
    r3_3 = Add64(r3_3, CTRS[2]);
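    // If an input buffer is present, XOR the keystream into it; otherwise
    // write the raw keystream blocks. Loads and stores are byte-wise, so no
    // alignment is required for input or output.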
    if (input)
    {
        r0_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 0*16)), r0_0);
        r0_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 1*16)), r0_1);
        r0_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 2*16)), r0_2);
        r0_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 3*16)), r0_3);
    }

    vst1q_u8(output + 0*16, vreinterpretq_u8_u32(r0_0));
    vst1q_u8(output + 1*16, vreinterpretq_u8_u32(r0_1));
    vst1q_u8(output + 2*16, vreinterpretq_u8_u32(r0_2));
    vst1q_u8(output + 3*16, vreinterpretq_u8_u32(r0_3));

    if (input)
    {
        r1_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 4*16)), r1_0);
        r1_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 5*16)), r1_1);
        r1_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 6*16)), r1_2);
        r1_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 7*16)), r1_3);
    }

    vst1q_u8(output + 4*16, vreinterpretq_u8_u32(r1_0));
    vst1q_u8(output + 5*16, vreinterpretq_u8_u32(r1_1));
    vst1q_u8(output + 6*16, vreinterpretq_u8_u32(r1_2));
    vst1q_u8(output + 7*16, vreinterpretq_u8_u32(r1_3));

    if (input)
    {
        r2_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 8*16)), r2_0);
        r2_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 9*16)), r2_1);
        r2_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 10*16)), r2_2);
        r2_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 11*16)), r2_3);
    }

    vst1q_u8(output + 8*16, vreinterpretq_u8_u32(r2_0));
    vst1q_u8(output + 9*16, vreinterpretq_u8_u32(r2_1));
    vst1q_u8(output + 10*16, vreinterpretq_u8_u32(r2_2));
    vst1q_u8(output + 11*16, vreinterpretq_u8_u32(r2_3));

    if (input)
    {
        r3_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 12*16)), r3_0);
        r3_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 13*16)), r3_1);
        r3_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 14*16)), r3_2);
        r3_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 15*16)), r3_3);
    }

    vst1q_u8(output + 12*16, vreinterpretq_u8_u32(r3_0));
    vst1q_u8(output + 13*16, vreinterpretq_u8_u32(r3_1));
    vst1q_u8(output + 14*16, vreinterpretq_u8_u32(r3_2));
    vst1q_u8(output + 15*16, vreinterpretq_u8_u32(r3_3));
}

#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)

void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    const __m128i* state_mm = reinterpret_cast<const __m128i*>(state);
    const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
    __m128i* output_mm = reinterpret_cast<__m128i*>(output);
    const __m128i state0 = _mm_load_si128(state_mm + 0);
    const __m128i state1 = _mm_load_si128(state_mm + 1);
    const __m128i state2 = _mm_load_si128(state_mm + 2);
    const __m128i state3 = _mm_load_si128(state_mm + 3);

    __m128i r0_0 = state0;
    __m128i r0_1 = state1;
    __m128i r0_2 = state2;
    __m128i r0_3 = state3;

    __m128i r1_0 = state0;
    __m128i r1_1 = state1;
    __m128i r1_2 = state2;
    __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));

    __m128i r2_0 = state0;
    __m128i r2_1 = state1;
    __m128i r2_2 = state2;
    __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));

    __m128i r3_0 = state0;
    __m128i r3_1 = state1;
    __m128i r3_2 = state2;
    __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));
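    // The double-round loop below mirrors the NEON path. The only structural
    // difference is that the diagonalization is done with _mm_shuffle_epi32
    // and _MM_SHUFFLE selectors instead of the Extract<S> lane rotations.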
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = RotateLeft<16>(r0_3);
        r1_3 = RotateLeft<16>(r1_3);
        r2_3 = RotateLeft<16>(r2_3);
        r3_3 = RotateLeft<16>(r3_3);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = RotateLeft<12>(r0_1);
        r1_1 = RotateLeft<12>(r1_1);
        r2_1 = RotateLeft<12>(r2_1);
        r3_1 = RotateLeft<12>(r3_1);

        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = RotateLeft<8>(r0_3);
        r1_3 = RotateLeft<8>(r1_3);
        r2_3 = RotateLeft<8>(r2_3);
        r3_3 = RotateLeft<8>(r3_3);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = RotateLeft<7>(r0_1);
        r1_1 = RotateLeft<7>(r1_1);
        r2_1 = RotateLeft<7>(r2_1);
        r3_1 = RotateLeft<7>(r3_1);

        r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
        r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
        r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));

        r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
        r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
        r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));

        r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
        r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
        r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));

        r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
        r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
        r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = RotateLeft<16>(r0_3);
        r1_3 = RotateLeft<16>(r1_3);
        r2_3 = RotateLeft<16>(r2_3);
        r3_3 = RotateLeft<16>(r3_3);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = RotateLeft<12>(r0_1);
        r1_1 = RotateLeft<12>(r1_1);
        r2_1 = RotateLeft<12>(r2_1);
        r3_1 = RotateLeft<12>(r3_1);

        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = RotateLeft<8>(r0_3);
        r1_3 = RotateLeft<8>(r1_3);
        r2_3 = RotateLeft<8>(r2_3);
        r3_3 = RotateLeft<8>(r3_3);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = RotateLeft<7>(r0_1);
        r1_1 = RotateLeft<7>(r1_1);
        r2_1 = RotateLeft<7>(r2_1);
        r3_1 = RotateLeft<7>(r3_1);

        r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
        r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
        r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));

        r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
        r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
        r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));

        r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
        r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
        r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));

        r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
        r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
        r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
    }
    r0_0 = _mm_add_epi32(r0_0, state0);
    r0_1 = _mm_add_epi32(r0_1, state1);
    r0_2 = _mm_add_epi32(r0_2, state2);
    r0_3 = _mm_add_epi32(r0_3, state3);

    r1_0 = _mm_add_epi32(r1_0, state0);
    r1_1 = _mm_add_epi32(r1_1, state1);
    r1_2 = _mm_add_epi32(r1_2, state2);
    r1_3 = _mm_add_epi32(r1_3, state3);
    r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));

    r2_0 = _mm_add_epi32(r2_0, state0);
    r2_1 = _mm_add_epi32(r2_1, state1);
    r2_2 = _mm_add_epi32(r2_2, state2);
    r2_3 = _mm_add_epi32(r2_3, state3);
    r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));

    r3_0 = _mm_add_epi32(r3_0, state0);
    r3_1 = _mm_add_epi32(r3_1, state1);
    r3_2 = _mm_add_epi32(r3_2, state2);
    r3_3 = _mm_add_epi32(r3_3, state3);
    r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
    if (input_mm)
    {
        r0_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 0), r0_0);
        r0_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 1), r0_1);
        r0_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 2), r0_2);
        r0_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 3), r0_3);
    }

    _mm_storeu_si128(output_mm + 0, r0_0);
    _mm_storeu_si128(output_mm + 1, r0_1);
    _mm_storeu_si128(output_mm + 2, r0_2);
    _mm_storeu_si128(output_mm + 3, r0_3);

    if (input_mm)
    {
        r1_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 4), r1_0);
        r1_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 5), r1_1);
        r1_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 6), r1_2);
        r1_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 7), r1_3);
    }

    _mm_storeu_si128(output_mm + 4, r1_0);
    _mm_storeu_si128(output_mm + 5, r1_1);
    _mm_storeu_si128(output_mm + 6, r1_2);
    _mm_storeu_si128(output_mm + 7, r1_3);

    if (input_mm)
    {
        r2_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 8), r2_0);
        r2_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 9), r2_1);
        r2_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 10), r2_2);
        r2_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 11), r2_3);
    }

    _mm_storeu_si128(output_mm + 8, r2_0);
    _mm_storeu_si128(output_mm + 9, r2_1);
    _mm_storeu_si128(output_mm + 10, r2_2);
    _mm_storeu_si128(output_mm + 11, r2_3);

    if (input_mm)
    {
        r3_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 12), r3_0);
        r3_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 13), r3_1);
        r3_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 14), r3_2);
        r3_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 15), r3_3);
    }

    _mm_storeu_si128(output_mm + 12, r3_0);
    _mm_storeu_si128(output_mm + 13, r3_1);
    _mm_storeu_si128(output_mm + 14, r3_2);
    _mm_storeu_si128(output_mm + 15, r3_3);
}

#endif  // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE

#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)

inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    // Setup parallels the NEON and SSE2 paths: load the caller's state, then
    // derive four working copies whose 64-bit block counters are offset by
    // 0, 1, 2 and 3.
    const uint32x4_p state0 = VecLoad(state + 0*4);
    const uint32x4_p state1 = VecLoad(state + 1*4);
    const uint32x4_p state2 = VecLoad(state + 2*4);
    const uint32x4_p state3 = VecLoad(state + 3*4);

    const uint32x4_p CTRS[3] = {
        {1,0,0,0}, {2,0,0,0}, {3,0,0,0}
    };

    uint32x4_p r0_0 = state0;
    uint32x4_p r0_1 = state1;
    uint32x4_p r0_2 = state2;
    uint32x4_p r0_3 = state3;

    uint32x4_p r1_0 = state0;
    uint32x4_p r1_1 = state1;
    uint32x4_p r1_2 = state2;
    uint32x4_p r1_3 = VecAdd64(r0_3, CTRS[0]);

    uint32x4_p r2_0 = state0;
    uint32x4_p r2_1 = state1;
    uint32x4_p r2_2 = state2;
    uint32x4_p r2_3 = VecAdd64(r0_3, CTRS[1]);

    uint32x4_p r3_0 = state0;
    uint32x4_p r3_1 = state1;
    uint32x4_p r3_2 = state2;
    uint32x4_p r3_3 = VecAdd64(r0_3, CTRS[2]);

    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        r0_0 = VecAdd(r0_0, r0_1);
        r1_0 = VecAdd(r1_0, r1_1);
        r2_0 = VecAdd(r2_0, r2_1);
        r3_0 = VecAdd(r3_0, r3_1);

        r0_3 = VecXor(r0_3, r0_0);
        r1_3 = VecXor(r1_3, r1_0);
        r2_3 = VecXor(r2_3, r2_0);
        r3_3 = VecXor(r3_3, r3_0);

        r0_3 = VecRotateLeft<16>(r0_3);
        r1_3 = VecRotateLeft<16>(r1_3);
        r2_3 = VecRotateLeft<16>(r2_3);
        r3_3 = VecRotateLeft<16>(r3_3);

        r0_2 = VecAdd(r0_2, r0_3);
        r1_2 = VecAdd(r1_2, r1_3);
        r2_2 = VecAdd(r2_2, r2_3);
        r3_2 = VecAdd(r3_2, r3_3);

        r0_1 = VecXor(r0_1, r0_2);
        r1_1 = VecXor(r1_1, r1_2);
        r2_1 = VecXor(r2_1, r2_2);
        r3_1 = VecXor(r3_1, r3_2);

        r0_1 = VecRotateLeft<12>(r0_1);
        r1_1 = VecRotateLeft<12>(r1_1);
        r2_1 = VecRotateLeft<12>(r2_1);
        r3_1 = VecRotateLeft<12>(r3_1);

        r0_0 = VecAdd(r0_0, r0_1);
        r1_0 = VecAdd(r1_0, r1_1);
        r2_0 = VecAdd(r2_0, r2_1);
        r3_0 = VecAdd(r3_0, r3_1);

        r0_3 = VecXor(r0_3, r0_0);
        r1_3 = VecXor(r1_3, r1_0);
        r2_3 = VecXor(r2_3, r2_0);
        r3_3 = VecXor(r3_3, r3_0);

        r0_3 = VecRotateLeft<8>(r0_3);
        r1_3 = VecRotateLeft<8>(r1_3);
        r2_3 = VecRotateLeft<8>(r2_3);
        r3_3 = VecRotateLeft<8>(r3_3);

        r0_2 = VecAdd(r0_2, r0_3);
        r1_2 = VecAdd(r1_2, r1_3);
        r2_2 = VecAdd(r2_2, r2_3);
        r3_2 = VecAdd(r3_2, r3_3);

        r0_1 = VecXor(r0_1, r0_2);
        r1_1 = VecXor(r1_1, r1_2);
        r2_1 = VecXor(r2_1, r2_2);
        r3_1 = VecXor(r3_1, r3_2);

        r0_1 = VecRotateLeft<7>(r0_1);
        r1_1 = VecRotateLeft<7>(r1_1);
        r2_1 = VecRotateLeft<7>(r2_1);
        r3_1 = VecRotateLeft<7>(r3_1);
        r0_1 = Shuffle<1>(r0_1);
        r0_2 = Shuffle<2>(r0_2);
        r0_3 = Shuffle<3>(r0_3);

        r1_1 = Shuffle<1>(r1_1);
        r1_2 = Shuffle<2>(r1_2);
        r1_3 = Shuffle<3>(r1_3);

        r2_1 = Shuffle<1>(r2_1);
        r2_2 = Shuffle<2>(r2_2);
        r2_3 = Shuffle<3>(r2_3);

        r3_1 = Shuffle<1>(r3_1);
        r3_2 = Shuffle<2>(r3_2);
        r3_3 = Shuffle<3>(r3_3);
        r0_0 = VecAdd(r0_0, r0_1);
        r1_0 = VecAdd(r1_0, r1_1);
        r2_0 = VecAdd(r2_0, r2_1);
        r3_0 = VecAdd(r3_0, r3_1);

        r0_3 = VecXor(r0_3, r0_0);
        r1_3 = VecXor(r1_3, r1_0);
        r2_3 = VecXor(r2_3, r2_0);
        r3_3 = VecXor(r3_3, r3_0);

        r0_3 = VecRotateLeft<16>(r0_3);
        r1_3 = VecRotateLeft<16>(r1_3);
        r2_3 = VecRotateLeft<16>(r2_3);
        r3_3 = VecRotateLeft<16>(r3_3);

        r0_2 = VecAdd(r0_2, r0_3);
        r1_2 = VecAdd(r1_2, r1_3);
        r2_2 = VecAdd(r2_2, r2_3);
        r3_2 = VecAdd(r3_2, r3_3);

        r0_1 = VecXor(r0_1, r0_2);
        r1_1 = VecXor(r1_1, r1_2);
        r2_1 = VecXor(r2_1, r2_2);
        r3_1 = VecXor(r3_1, r3_2);

        r0_1 = VecRotateLeft<12>(r0_1);
        r1_1 = VecRotateLeft<12>(r1_1);
        r2_1 = VecRotateLeft<12>(r2_1);
        r3_1 = VecRotateLeft<12>(r3_1);

        r0_0 = VecAdd(r0_0, r0_1);
        r1_0 = VecAdd(r1_0, r1_1);
        r2_0 = VecAdd(r2_0, r2_1);
        r3_0 = VecAdd(r3_0, r3_1);

        r0_3 = VecXor(r0_3, r0_0);
        r1_3 = VecXor(r1_3, r1_0);
        r2_3 = VecXor(r2_3, r2_0);
        r3_3 = VecXor(r3_3, r3_0);

        r0_3 = VecRotateLeft<8>(r0_3);
        r1_3 = VecRotateLeft<8>(r1_3);
        r2_3 = VecRotateLeft<8>(r2_3);
        r3_3 = VecRotateLeft<8>(r3_3);

        r0_2 = VecAdd(r0_2, r0_3);
        r1_2 = VecAdd(r1_2, r1_3);
        r2_2 = VecAdd(r2_2, r2_3);
        r3_2 = VecAdd(r3_2, r3_3);

        r0_1 = VecXor(r0_1, r0_2);
        r1_1 = VecXor(r1_1, r1_2);
        r2_1 = VecXor(r2_1, r2_2);
        r3_1 = VecXor(r3_1, r3_2);

        r0_1 = VecRotateLeft<7>(r0_1);
        r1_1 = VecRotateLeft<7>(r1_1);
        r2_1 = VecRotateLeft<7>(r2_1);
        r3_1 = VecRotateLeft<7>(r3_1);
        r0_1 = Shuffle<3>(r0_1);
        r0_2 = Shuffle<2>(r0_2);
        r0_3 = Shuffle<1>(r0_3);

        r1_1 = Shuffle<3>(r1_1);
        r1_2 = Shuffle<2>(r1_2);
        r1_3 = Shuffle<1>(r1_3);

        r2_1 = Shuffle<3>(r2_1);
        r2_2 = Shuffle<2>(r2_2);
        r2_3 = Shuffle<1>(r2_3);

        r3_1 = Shuffle<3>(r3_1);
        r3_2 = Shuffle<2>(r3_2);
        r3_3 = Shuffle<1>(r3_3);
    }
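    // Feed-forward as in the other paths: add the original state back in and
    // re-apply the per-copy counter offsets with a 64-bit add.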
    r0_0 = VecAdd(r0_0, state0);
    r0_1 = VecAdd(r0_1, state1);
    r0_2 = VecAdd(r0_2, state2);
    r0_3 = VecAdd(r0_3, state3);

    r1_0 = VecAdd(r1_0, state0);
    r1_1 = VecAdd(r1_1, state1);
    r1_2 = VecAdd(r1_2, state2);
    r1_3 = VecAdd(r1_3, state3);
    r1_3 = VecAdd64(r1_3, CTRS[0]);

    r2_0 = VecAdd(r2_0, state0);
    r2_1 = VecAdd(r2_1, state1);
    r2_2 = VecAdd(r2_2, state2);
    r2_3 = VecAdd(r2_3, state3);
    r2_3 = VecAdd64(r2_3, CTRS[1]);

    r3_0 = VecAdd(r3_0, state0);
    r3_1 = VecAdd(r3_1, state1);
    r3_2 = VecAdd(r3_2, state2);
    r3_3 = VecAdd(r3_3, state3);
    r3_3 = VecAdd64(r3_3, CTRS[2]);
    if (input)
    {
        r0_0 = VecXor(VecLoad32LE(input + 0*16), r0_0);
        r0_1 = VecXor(VecLoad32LE(input + 1*16), r0_1);
        r0_2 = VecXor(VecLoad32LE(input + 2*16), r0_2);
        r0_3 = VecXor(VecLoad32LE(input + 3*16), r0_3);
    }

    VecStore32LE(output + 0*16, r0_0);
    VecStore32LE(output + 1*16, r0_1);
    VecStore32LE(output + 2*16, r0_2);
    VecStore32LE(output + 3*16, r0_3);

    if (input)
    {
        r1_0 = VecXor(VecLoad32LE(input + 4*16), r1_0);
        r1_1 = VecXor(VecLoad32LE(input + 5*16), r1_1);
        r1_2 = VecXor(VecLoad32LE(input + 6*16), r1_2);
        r1_3 = VecXor(VecLoad32LE(input + 7*16), r1_3);
    }

    VecStore32LE(output + 4*16, r1_0);
    VecStore32LE(output + 5*16, r1_1);
    VecStore32LE(output + 6*16, r1_2);
    VecStore32LE(output + 7*16, r1_3);

    if (input)
    {
        r2_0 = VecXor(VecLoad32LE(input + 8*16), r2_0);
        r2_1 = VecXor(VecLoad32LE(input + 9*16), r2_1);
        r2_2 = VecXor(VecLoad32LE(input + 10*16), r2_2);
        r2_3 = VecXor(VecLoad32LE(input + 11*16), r2_3);
    }

    VecStore32LE(output + 8*16, r2_0);
    VecStore32LE(output + 9*16, r2_1);
    VecStore32LE(output + 10*16, r2_2);
    VecStore32LE(output + 11*16, r2_3);

    if (input)
    {
        r3_0 = VecXor(VecLoad32LE(input + 12*16), r3_0);
        r3_1 = VecXor(VecLoad32LE(input + 13*16), r3_1);
        r3_2 = VecXor(VecLoad32LE(input + 14*16), r3_2);
        r3_3 = VecXor(VecLoad32LE(input + 15*16), r3_3);
    }

    VecStore32LE(output + 12*16, r3_0);
    VecStore32LE(output + 13*16, r3_1);
    VecStore32LE(output + 14*16, r3_2);
    VecStore32LE(output + 15*16, r3_3);
}

#endif  // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE

#if (CRYPTOPP_POWER7_AVAILABLE)

void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    ChaCha_OperateKeystream_CORE(state, input, output, rounds);
}

#elif (CRYPTOPP_ALTIVEC_AVAILABLE)

void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    ChaCha_OperateKeystream_CORE(state, input, output, rounds);
}

#endif

NAMESPACE_END