// adv_simd.h - written and placed in the public domain by Jeffrey Walton
//
// The SIMD implementations for ciphers that use SSE, NEON and Power7 share a
// common pattern: a specialized AdvancedProcessBlocks that processes multiple
// blocks with hardware acceleration. adv_simd.h provides templates for that
// pattern to avoid copy-and-paste across the cipher implementations.

#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <emmintrin.h>
# include <xmmintrin.h>
#endif

// SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput)
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel)
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter)
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection)
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers)

ANONYMOUS_NAMESPACE_END
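// Notes on the word32 `flags` parameter used throughout this file. The values
// come from BlockTransformation in cryptlib.h: BT_InBlockIsCounter says
// inBlocks points at a writable CTR-mode counter block; BT_XorInput says XOR
// xorBlocks into the input before the cipher runs (otherwise a non-NULL
// xorBlocks is XORed into the output afterwards); BT_ReverseDirection walks
// the buffers from the last block to the first, as CBC decryption wants;
// BT_DontIncrementInOutPointers pins the in/out pointers in place; and
// BT_AllowParallel enables the wide 2/4/6-block paths below.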
// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
    const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
#else
    const uint32x4_t s_one = {0, 0, 0, 1};
    const uint32x4_t s_two = {0, 2, 0, 2};
#endif

    const size_t blockSize = 8;
    const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - neonBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - neonBlockSize);
        outBlocks = PtrAdd(outBlocks, length - neonBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*neonBlockSize)
        {
            uint32x4_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // For 64-bit block ciphers load the 8-byte counter into both
                // halves of a NEON word, then increment the low counter by 0
                // and the high counter by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After the initial increment of {0,1} the remaining counters
                // increment by {2,2}.
                block1 = vaddq_u32(s_two, block0);
                block2 = vaddq_u32(s_two, block1);
                block3 = vaddq_u32(s_two, block2);
                block4 = vaddq_u32(s_two, block3);
                block5 = vaddq_u32(s_two, block4);

                // Store the next counter back into the caller's counter array.
                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block5))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5,
                subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*neonBlockSize;
        }

        while (length >= 2*neonBlockSize)
        {
            uint32x4_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Load the 8-byte counter into both halves of a NEON word and
                // increment the high counter by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
                block1 = vaddq_u32(s_two, block0);

                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block1))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*neonBlockSize;
        }
    }

    if (length)
    {
        // Adjust the increments from the 16-byte NEON word to the 8-byte block.
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            uint32x4_t block, zero = {0};

            const uint8x8_t v = vld1_u8(inBlocks);
            block = vreinterpretq_u32_u8(vcombine_u8(v,v));

            if (xorInput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            vst1_u8(const_cast<byte*>(outBlocks),
                vget_low_u8(vreinterpretq_u8_u32(block)));

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
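// Usage sketch, with hypothetical names: a 64-bit block cipher wires its NEON
// round kernels into the template from its AdvancedProcessBlocks() override.
// The cipher supplies only the 2- and 6-block kernels; the template handles
// counters, the XOR policy and the pointer walking:
//
//   inline void MyCipher64_Enc_2_Blocks(uint32x4_t &block0, uint32x4_t &block1,
//       const word32 *subKeys, unsigned int rounds);
//   inline void MyCipher64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
//       uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4,
//       uint32x4_t &block5, const word32 *subKeys, unsigned int rounds);
//
//   size_t MyCipher64::Enc::AdvancedProcessBlocks(const byte *inBlocks,
//       const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
//   {
//       return AdvancedProcessBlocks64_6x2_NEON(MyCipher64_Enc_2_Blocks,
//           MyCipher64_Enc_6_Blocks, m_rkeys.begin(), m_rounds,
//           inBlocks, xorBlocks, outBlocks, length, flags);
//   }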
/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
#else
    const uint32x4_t s_one = {0, 0, 0, 1};
#endif

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // For 128-bit block ciphers the counter is the whole block.
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5,
                subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
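// A worked example of the counter constant above, on a little-endian core:
// s_one = {0,0,0,1<<24} places the value 0x01000000 in the last 32-bit lane,
// which the CPU stores as bytes 00 00 00 01 at offsets 12..15. Seen as the
// byte array the cipher works on, only byte 15 -- the least significant byte
// of a big-endian counter -- is incremented, so vaddq_u64(block, one) yields
// the next counter block without any byte swapping.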
/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \tparam V vector type of the NEON data
template <typename F1, typename F4, typename W, typename V>
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
    const V& unused, const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);
    CRYPTOPP_UNUSED(unused);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
#else
    const uint32x4_t s_one = {0, 0, 0, 1};
#endif

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint64x2_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block3, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4((V&)block0, (V&)block1, (V&)block2, (V&)block3,
                subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1((V&)block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
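// The V template parameter above exists to pin down the NEON vector type a
// cipher's kernels use (typically uint32x4_t or uint64x2_t). The dummy
// `unused` argument steers template argument deduction -- passing, say,
// uint64x2_t() selects V = uint64x2_t -- and the (V&) casts then reinterpret
// the uint64x2_t working blocks in place, without copies.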
/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
#else
    const uint32x4_t s_one = {0, 0, 0, 1};
#endif

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5,
                subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
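// Note on the single-block tail just above: the 6x2 shape has no 1-block
// kernel, so a lone block is paired with a zero dummy block, run through
// func2, and the dummy result is discarded. Ciphers therefore implement only
// 2- and 6-block kernels for this template.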
NAMESPACE_END  // CryptoPP

#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

// *************************** x86 SSE *************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// SunCC cannot consume the const subkey tables used by these templates, so
// the const is stripped for SunCC 5.13 and above.
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif

// Casts for the SSE intrinsics, which want __m128i* rather than byte*.
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 2 blocks
/// \tparam F1 function to process 1 64-bit block
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F2, typename W>
inline size_t AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increments of 1 and 2 on the last byte of each 8-byte
                // big-endian counter copy.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // For 64-bit block ciphers load the 8-byte counter into both
                // halves of the XMM word, then increment the high copy by 1.
                double temp[2];
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After the initial increment of {0,1} the remaining counters
                // increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);

                // Store the next counter back into the caller's counter array.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust the increments from the 16-byte XMM word to the 8-byte block.
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            std::memcpy(temp, inBlocks, blockSize);
            __m128i block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func1(block, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
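// How the 8-byte counter rides in a 16-byte XMM register above:
// _mm_loaddup_pd duplicates the counter into both 64-bit lanes, then
// s_one = _mm_set_epi32(1<<24,0,0,0) adds 1 to byte 15, that is, to the last
// byte of the high copy only. The register then holds {ctr, ctr+1}, and
// s_two = _mm_set_epi32(2<<24,0,2<<24,0) steps both copies by 2 per
// iteration.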
/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*xmmBlockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increments of 1 and 2 on the last byte of each 8-byte
                // big-endian counter copy.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // Load the 8-byte counter into both halves of the XMM word.
                double temp[2];
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After the initial increment of {0,1} the remaining counters
                // increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);
                block2 = _mm_add_epi32(s_two, block1);
                block3 = _mm_add_epi32(s_two, block2);
                block4 = _mm_add_epi32(s_two, block3);
                block5 = _mm_add_epi32(s_two, block4);

                // Store the next counter back into the caller's counter array.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi32(s_two, block5)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5,
                subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*xmmBlockSize;
        }

        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                double temp[2];
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
                block1 = _mm_add_epi32(s_two, block0);

                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust the increments from the 16-byte XMM word to the 8-byte block.
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            __m128i block, zero = _mm_setzero_si128();
            std::memcpy(temp, inBlocks, blockSize);
            block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block,
                    _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block,
                    _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
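// A concrete trace of the increment re-basing above, forward direction:
// while whole 16-byte XMM words remain, inIncrement is 16. Once fewer than
// 16 bytes are left it is reduced to 8 so the tail walks single 8-byte
// blocks. In the reverse direction the increments are negative, so adding
// blockSize rebases -16 to -8, and the PtrSub calls re-position the pointers
// onto the final 8-byte block before the tail loop runs.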
/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 on the last byte of the big-endian counter.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                block4 = _mm_add_epi32(block3, s_one);
                block5 = _mm_add_epi32(block4, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5,
                subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
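// Usage sketch, with hypothetical names: the kernels this template expects
// transform the blocks in place, along the lines of
//
//   inline void MyCipher_Enc_2_Blocks(__m128i &block0, __m128i &block1,
//       MAYBE_CONST word32 *subKeys, unsigned int rounds);
//   inline void MyCipher_Enc_6_Blocks(__m128i &block0, __m128i &block1,
//       __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
//       MAYBE_CONST word32 *subKeys, unsigned int rounds);
//
// and a cipher's AdvancedProcessBlocks() then forwards:
//
//   return AdvancedProcessBlocks128_6x2_SSE(MyCipher_Enc_2_Blocks,
//       MyCipher_Enc_6_Blocks, m_rkeys.begin(), m_rounds,
//       inBlocks, xorBlocks, outBlocks, length, flags);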
/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 on the last byte of the big-endian counter.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
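// A caveat when reading the SSE counter paths: _mm_add_epi32 carries only
// within each 32-bit lane, so the s_one pattern increments counter byte 15
// with carries confined to bytes 12..15. A wrap beyond that is rare and is
// left to the caller -- CTR_ModePolicy detects the last-byte wrap between
// calls -- rather than handled inside these templates.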
/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 64-bit block
/// \tparam F4 function to process 4 64-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks64_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter | BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0 - inIncrement;
        xorIncrement = 0 - xorIncrement;
        outIncrement = 0 - outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*xmmBlockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Increments of 1 and 2 on the last byte of each 8-byte
                // big-endian counter copy.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // Load the 8-byte counter into both halves of the XMM word.
                double temp[2];
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After the initial increment of {0,1} the remaining counters
                // increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);
                block2 = _mm_add_epi32(s_two, block1);
                block3 = _mm_add_epi32(s_two, block2);

                // Store the next counter back into the caller's counter array.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block3)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust the increments from the 16-byte XMM word to the 8-byte block.
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            std::memcpy(temp, inBlocks, blockSize);
            __m128i block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func1(block, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
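// Why the double temp[2] staging in the 64-bit SSE paths: an 8-byte cipher
// block may sit at any alignment, and dereferencing it directly as a double
// or __m128i would be undefined behavior. memcpy into a local temp plus
// _mm_load_sd/_mm_store_sd gives a well-defined 8-byte move that compilers
// lower to a single load or store.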
NAMESPACE_END  // CryptoPP

#endif  // CRYPTOPP_SSSE3_AVAILABLE

// ************************** Altivec/Power 4 ************************** //

#if defined(__ALTIVEC__)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

    // The constants are in vector lane order after the big-endian load, so
    // the lanes are reversed on little-endian machines.
#if (CRYPTOPP_LITTLE_ENDIAN)
    enum {LowOffset=8, HighOffset=0};
    const uint32x4_p s_one = {1,0,0,0};
    const uint32x4_p s_two = {2,0,2,0};
#else
    enum {LowOffset=8, HighOffset=0};
    const uint32x4_p s_one = {0,0,0,1};
    const uint32x4_p s_two = {0,2,0,2};
#endif

    const size_t blockSize = 8;
    const size_t vsxBlockSize = 16;
    CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16];

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : vsxBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? vsxBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : vsxBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - vsxBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - vsxBlockSize);
        outBlocks = PtrAdd(outBlocks, length - vsxBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*vsxBlockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // There is no easy way to load 8 bytes into a vector, so
                // duplicate the 8-byte counter into both halves of temp.
                std::memcpy(temp+LowOffset, inBlocks, 8);
                std::memcpy(temp+HighOffset, inBlocks, 8);
                const uint32x4_p ctr = VecLoadBE(temp);

                // The low counter is incremented by 1 and the high by 0.
                block0 = VecAdd(s_one, ctr);

                // After the initial increment of {0,1} the remaining counters
                // increment by {2,2}.
                block1 = VecAdd(s_two, block0);
                block2 = VecAdd(s_two, block1);
                block3 = VecAdd(s_two, block2);
                block4 = VecAdd(s_two, block3);
                block5 = VecAdd(s_two, block4);

                // Update the counter in the caller.
                const_cast<byte*>(inBlocks)[7] += 12;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5,
                subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*vsxBlockSize;
        }

        while (length >= 2*vsxBlockSize)
        {
            uint32x4_p block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Duplicate the 8-byte counter into both halves of temp.
                std::memcpy(temp+LowOffset, inBlocks, 8);
                std::memcpy(temp+HighOffset, inBlocks, 8);
                const uint32x4_p ctr = VecLoadBE(temp);

                block0 = VecAdd(s_one, ctr);
                block1 = VecAdd(s_two, block0);

                // Update the counter in the caller.
                const_cast<byte*>(inBlocks)[7] += 4;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*vsxBlockSize;
        }
    }

    if (length)
    {
        // Adjust the increments from the 16-byte VSX word to the 8-byte block.
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            uint32x4_p block, zero = {0,0,0,0};

            std::memcpy(temp+LowOffset, inBlocks, 8);
            std::memcpy(temp+HighOffset, inBlocks, 8);
            block = VecLoadBE(temp);

            if (xorInput)
            {
                std::memcpy(temp+LowOffset, xorBlocks, 8);
                std::memcpy(temp+HighOffset, xorBlocks, 8);
                const uint32x4_p x = VecLoadBE(temp);
                block = VecXor(block, x);
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp+LowOffset, xorBlocks, 8);
                std::memcpy(temp+HighOffset, xorBlocks, 8);
                const uint32x4_p x = VecLoadBE(temp);
                block = VecXor(block, x);
            }

            VecStoreBE(block, temp);
            std::memcpy(outBlocks, temp+LowOffset, 8);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
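// The temp staging with LowOffset/HighOffset mirrors the NEON vcombine and
// SSE loaddup tricks: the 8-byte block is duplicated into both halves of a
// 16-byte buffer so one VSX register carries two cipher blocks. The 16-byte
// aligned buffer also satisfies the stricter alignment POWER7 imposes on
// vector loads.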
/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_p block0, block1, block2, block3;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);

                // Update the counter in the caller with a byte-wise bump. The
                // increment avoids a 32-bit vector carry interacting with the
                // last-byte wrap detection in CTR_ModePolicy on big-endian
                // loads.
                const_cast<byte*>(inBlocks)[15] += 6;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
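// VecLoadBE/VecStoreBE fold the endian handling into the loads and stores:
// on big-endian POWER they are plain vector moves, while on little-endian
// they add a byte permute, so the block arithmetic above is written once
// for both layouts.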
/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);
                block4 = VecAdd(block3, s_one);
                block5 = VecAdd(block4, s_one);

                // Write the next counter (block5 + 1) back to the caller.
                const uint32x4_p temp = VecAdd(block5, s_one);
                VecStoreBE(temp, const_cast<byte*>(inBlocks));
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END  // CryptoPP

#endif  // __ALTIVEC__

#endif  // CRYPTOPP_ADVANCED_SIMD_TEMPLATES