Crypto++ 8.0
Free C++ class library of cryptographic schemes
adv_simd.h
// adv_simd.h - written and placed in the public domain by Jeffrey Walton

/// \file adv_simd.h
/// \brief Template for AdvancedProcessBlocks and SIMD processing

// The SIMD based implementations for ciphers that use SSE, NEON and Power7
// have a common pattern. Namely, they have a specialized implementation of
// AdvancedProcessBlocks which processes multiple blocks using hardware
// acceleration. After several implementations we noticed a lot of copy and
// paste occurring. adv_simd.h provides a template to avoid the copy and paste.
//
// There are 12 templates provided in this file. The number following the
// function name, 64 or 128, is the block size. The name following the block
// size is the arrangement and acceleration. For example 4x1_SSE means Intel
// SSE using two encrypt (or decrypt) functions: one that operates on 4 SIMD
// words, and one that operates on 1 SIMD word.
//
// The distinction between SIMD words and cipher blocks is important
// because 64-bit ciphers use one SIMD word for two cipher blocks. For
// example, AdvancedProcessBlocks64_6x2_ALTIVEC operates on 6 and 2 SIMD
// words, which is 12 and 4 cipher blocks. The function will do the right
// thing even if there is only one 64-bit block to encrypt.
//
// * AdvancedProcessBlocks64_2x1_SSE
// * AdvancedProcessBlocks64_4x1_SSE
// * AdvancedProcessBlocks128_4x1_SSE
// * AdvancedProcessBlocks64_6x2_SSE
// * AdvancedProcessBlocks128_6x2_SSE
// * AdvancedProcessBlocks64_6x2_NEON
// * AdvancedProcessBlocks128_4x1_NEON
// * AdvancedProcessBlocks128_6x1_NEON
// * AdvancedProcessBlocks128_6x2_NEON
// * AdvancedProcessBlocks64_6x2_ALTIVEC
// * AdvancedProcessBlocks128_4x1_ALTIVEC
// * AdvancedProcessBlocks128_6x1_ALTIVEC
//
// If an arrangement ends in 2, like 6x2, then the template will handle the
// single block case by padding with 0's and using the two SIMD word
// function. This happens at most one time when processing multiple blocks.
// The extra processing of a zero block is trivial and worth the tradeoff.
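//
// For example, a cipher's SIMD implementation can wire its two- and
// six-block NEON kernels into a template with a thin wrapper. A minimal
// sketch, with hypothetical function names standing in for a real cipher's
// F2/F6 kernels:
//
//   size_t MyCipher_Enc_AdvancedProcessBlocks_NEON(const word32 *subKeys,
//       size_t rounds, const byte *inBlocks, const byte *xorBlocks,
//       byte *outBlocks, size_t length, word32 flags)
//   {
//       return AdvancedProcessBlocks64_6x2_NEON(MyCipher_Enc_Block2,
//           MyCipher_Enc_Block6, subKeys, rounds, inBlocks, xorBlocks,
//           outBlocks, length, flags);
//   }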
//
// The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
// of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
// results in a failed link due to the const/non-const mismatch.

#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"
#include "stdcpp.h"

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <emmintrin.h>
# include <xmmintrin.h>
#endif

// SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

// ************************ All block ciphers *********************** //

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput)
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel)
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter)
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection)
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers)

ANONYMOUS_NAMESPACE_END

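// All of the templates below implement the same AdvancedProcessBlocks
// contract: length is a byte count rather than a block count; xorBlocks,
// when non-NULL, is XORed into the input (BT_XorInput) or into the output
// (otherwise); and the return value is the number of bytes left unprocessed.
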
// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks64_6x2_NEON processes 6 and 2 NEON SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

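    // The constants produce increments of 1 and 2 in big-endian, compatible
    // with the ctr byte array. On little-endian the +1/+2 must land in the
    // most significant byte of the affected 32-bit lane, hence the <<24.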
#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
    const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
#else
    // TODO: verify these constants on ARM-BE
    const uint32x4_t s_one = {0, 0, 0, 1};
    const uint32x4_t s_two = {0, 2, 0, 2};
#endif

    const size_t blockSize = 8;
    const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;

    // Clang and Coverity generate findings when xorBlocks is used as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - neonBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - neonBlockSize);
        outBlocks = PtrAdd(outBlocks, length - neonBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*neonBlockSize)
        {
            uint32x4_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the NEON word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = vaddq_u32(s_two, block0);
                block2 = vaddq_u32(s_two, block1);
                block3 = vaddq_u32(s_two, block2);
                block4 = vaddq_u32(s_two, block3);
                block5 = vaddq_u32(s_two, block4);

                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block5))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*neonBlockSize;
        }

        while (length >= 2*neonBlockSize)
        {
            uint32x4_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the NEON word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = vaddq_u32(s_two, block0);

                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block1))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*neonBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            uint32x4_t block, zero = {0};

            const uint8x8_t v = vld1_u8(inBlocks);
            block = vreinterpretq_u32_u8(vcombine_u8(v,v));

            if (xorInput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            vst1_u8(const_cast<byte*>(outBlocks),
                vget_low_u8(vreinterpretq_u8_u32(block)));

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}

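// The functors the 64-bit NEON templates expect follow the call sites above:
// blocks are taken by reference and transformed in place. A minimal sketch
// with hypothetical names (real kernels live with each cipher's SIMD code):
//
//   inline void MyCipher_Enc_Block2(uint32x4_t &block0, uint32x4_t &block1,
//       const word32 *subkeys, unsigned int rounds);
//   inline void MyCipher_Enc_Block6(uint32x4_t &block0, uint32x4_t &block1,
//       uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4,
//       uint32x4_t &block5, const word32 *subkeys, unsigned int rounds);
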
/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 1 NEON SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
/// same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
    //const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
#else
    // TODO: verify these constants on ARM-BE
    const uint32x4_t s_one = {0, 0, 0, 1};
    //const uint32x4_t s_two = {0, 2, 0, 2};
#endif

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity generate findings when xorBlocks is used as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \tparam V vector type of the NEON datatype
/// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. V is the vector type and it is
/// usually uint32x4_t or uint64x2_t. F1, F4, W and V must use the same word and
/// vector type. The V parameter is used to avoid template argument
/// deduction/substitution failures.
template <typename F1, typename F4, typename W, typename V>
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
    const V& unused, const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);
    CRYPTOPP_UNUSED(unused);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
    //const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
#else
    // TODO: verify these constants on ARM-BE
    const uint32x4_t s_one = {0, 0, 0, 1};
    //const uint32x4_t s_two = {0, 2, 0, 2};
#endif

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity generate findings when xorBlocks is used as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint64x2_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block3, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4((V&)block0, (V&)block1, (V&)block2, (V&)block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1((V&)block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

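// Callers pin V by passing a dummy vector value as the third argument. A
// minimal sketch with hypothetical functor names:
//
//   return AdvancedProcessBlocks128_4x1_NEON(MyCipher_Enc_Block1,
//       MyCipher_Enc_Block4, vdupq_n_u32(0), subKeys, rounds,
//       inBlocks, xorBlocks, outBlocks, length, flags);
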
/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
    //const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
#else
    // TODO: verify these constants on ARM-BE
    const uint32x4_t s_one = {0, 0, 0, 1};
    //const uint32x4_t s_two = {0, 2, 0, 2};
#endif

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity generate findings when xorBlocks is used as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END // CryptoPP

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// *************************** Intel SSE ************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif

// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 2 blocks
/// \tparam F1 function to process 1 64-bit block
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks64_2x1_SSE processes 2 and 1 SSE SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F2 must use the
/// same word type.
template <typename F1, typename F2, typename W>
inline size_t AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity generate findings when xorBlocks is used as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        double temp[2];
        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the XMM word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);

                // Store the next counter. When BT_InBlockIsCounter is set then
                // inBlocks is backed by m_counterArray which is non-const.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            std::memcpy(temp, inBlocks, blockSize);
            __m128i block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func1(block, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}

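// A note on the double temp[2] staging above: the 8-byte blocks are copied
// with std::memcpy and loaded through _mm_load_sd/_mm_loaddup_pd, then
// reinterpreted as __m128i with _mm_castpd_si128 (and stored back via
// _mm_store_sd plus memcpy). The memcpy staging sidesteps alignment and
// strict-aliasing concerns when treating the byte array as a 64-bit lane.
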
/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks64_6x2_SSE processes 6 and 2 SSE SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity generate findings when xorBlocks is used as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        double temp[2];
        while (length >= 6*xmmBlockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the XMM word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);
                block2 = _mm_add_epi32(s_two, block1);
                block3 = _mm_add_epi32(s_two, block2);
                block4 = _mm_add_epi32(s_two, block3);
                block5 = _mm_add_epi32(s_two, block4);

                // Store the next counter. When BT_InBlockIsCounter is set then
                // inBlocks is backed by m_counterArray which is non-const.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi32(s_two, block5)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*xmmBlockSize;
        }

        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the XMM word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);

                // Store the next counter. When BT_InBlockIsCounter is set then
                // inBlocks is backed by m_counterArray which is non-const.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            __m128i block, zero = _mm_setzero_si128();
            std::memcpy(temp, inBlocks, blockSize);
            block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block,
                    _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block,
                    _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity generate findings when xorBlocks is used as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                block4 = _mm_add_epi32(block3, s_one);
                block5 = _mm_add_epi32(block4, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

1477 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
1478 /// \tparam F1 function to process 1 128-bit block
1479 /// \tparam F4 function to process 4 128-bit blocks
1480 /// \tparam W word type of the subkey table
1481 /// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
1482 /// at a time.
1483 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
1484 /// same word type.
1485 template <typename F1, typename F4, typename W>
1486 inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
1487  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1488  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1489 {
1490  CRYPTOPP_ASSERT(subKeys);
1491  CRYPTOPP_ASSERT(inBlocks);
1492  CRYPTOPP_ASSERT(outBlocks);
1493  CRYPTOPP_ASSERT(length >= 16);
1494 
1495  const size_t blockSize = 16;
1496  // const size_t xmmBlockSize = 16;
1497 
1498  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1499  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1500  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1501 
1502  // Clang and Coverity are generating findings using xorBlocks as a flag.
1503  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1504  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1505 
1506  if (flags & BT_ReverseDirection)
1507  {
1508  inBlocks = PtrAdd(inBlocks, length - blockSize);
1509  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
1510  outBlocks = PtrAdd(outBlocks, length - blockSize);
1511  inIncrement = 0-inIncrement;
1512  xorIncrement = 0-xorIncrement;
1513  outIncrement = 0-outIncrement;
1514  }
1515 
1516  if (flags & BT_AllowParallel)
1517  {
1518  while (length >= 4*blockSize)
1519  {
1520  __m128i block0, block1, block2, block3;
1521  if (flags & BT_InBlockIsCounter)
1522  {
1523  // Increment of 1 in big-endian, compatible with the ctr byte array.
1524  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1525  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1526  block1 = _mm_add_epi32(block0, s_one);
1527  block2 = _mm_add_epi32(block1, s_one);
1528  block3 = _mm_add_epi32(block2, s_one);
1529  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
1530  }
1531  else
1532  {
1533  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1534  inBlocks = PtrAdd(inBlocks, inIncrement);
1535  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1536  inBlocks = PtrAdd(inBlocks, inIncrement);
1537  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1538  inBlocks = PtrAdd(inBlocks, inIncrement);
1539  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1540  inBlocks = PtrAdd(inBlocks, inIncrement);
1541  }
1542 
1543  if (xorInput)
1544  {
1545  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1546  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1547  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1548  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1549  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1550  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1551  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1552  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1553  }
1554 
1555  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1556 
1557  if (xorOutput)
1558  {
1559  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1560  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1561  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1562  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1563  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1564  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1565  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1566  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1567  }
1568 
1569  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1570  outBlocks = PtrAdd(outBlocks, outIncrement);
1571  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1572  outBlocks = PtrAdd(outBlocks, outIncrement);
1573  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1574  outBlocks = PtrAdd(outBlocks, outIncrement);
1575  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1576  outBlocks = PtrAdd(outBlocks, outIncrement);
1577 
1578  length -= 4*blockSize;
1579  }
1580  }
1581 
1582  while (length >= blockSize)
1583  {
1584  __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1585 
1586  if (xorInput)
1587  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1588 
1589  if (flags & BT_InBlockIsCounter)
1590  const_cast<byte *>(inBlocks)[15]++;
1591 
1592  func1(block, subKeys, static_cast<unsigned int>(rounds));
1593 
1594  if (xorOutput)
1595  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1596 
1597  _mm_storeu_si128(M128_CAST(outBlocks), block);
1598 
1599  inBlocks = PtrAdd(inBlocks, inIncrement);
1600  outBlocks = PtrAdd(outBlocks, outIncrement);
1601  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1602  length -= blockSize;
1603  }
1604 
1605  return length;
1606 }
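
// Editorial usage sketch (hypothetical names, not part of the library):
// a cipher's SSE implementation supplies two functors and forwards to the
// template. The signatures below are inferred from the call sites above,
// where each block is modified in place.
//
//   inline void MyCipher_Enc_Block(__m128i &block,
//       const word32 *subkeys, unsigned int rounds) { /* one block */ }
//
//   inline void MyCipher_Enc_4_Blocks(__m128i &block0, __m128i &block1,
//       __m128i &block2, __m128i &block3,
//       const word32 *subkeys, unsigned int rounds) { /* four blocks */ }
//
//   size_t MyCipher_Enc_AdvancedProcessBlocks_SSE(const word32 *subKeys,
//       size_t rounds, const byte *inBlocks, const byte *xorBlocks,
//       byte *outBlocks, size_t length, word32 flags)
//   {
//       // Returns the number of bytes left unprocessed (see 'return length').
//       return AdvancedProcessBlocks128_4x1_SSE(MyCipher_Enc_Block,
//           MyCipher_Enc_4_Blocks, subKeys, rounds, inBlocks,
//           xorBlocks, outBlocks, length, flags);
//   }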
1607 
1608 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
1609 /// \tparam F1 function to process 1 64-bit block
1610 /// \tparam F4 function to process 4 64-bit blocks
1611 /// \tparam W word type of the subkey table
1612 /// \details AdvancedProcessBlocks64_4x1_SSE processes 4 and 1 SSE SIMD words
1613 /// at a time.
1614 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
1615 /// same word type.
1616 template <typename F1, typename F4, typename W>
1617 inline size_t AdvancedProcessBlocks64_4x1_SSE(F1 func1, F4 func4,
1618  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1619  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1620 {
1621  CRYPTOPP_ASSERT(subKeys);
1622  CRYPTOPP_ASSERT(inBlocks);
1623  CRYPTOPP_ASSERT(outBlocks);
1624  CRYPTOPP_ASSERT(length >= 8);
1625 
1626  const size_t blockSize = 8;
1627  const size_t xmmBlockSize = 16;
1628 
1629  size_t inIncrement = (flags & (BT_InBlockIsCounter | BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
1630  size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
1631  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
1632 
1633  // Clang and Coverity are generating findings using xorBlocks as a flag.
1634  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1635  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1636 
1637  if (flags & BT_ReverseDirection)
1638  {
1639  inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
1640  xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
1641  outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
1642  inIncrement = 0 - inIncrement;
1643  xorIncrement = 0 - xorIncrement;
1644  outIncrement = 0 - outIncrement;
1645  }
1646 
1647  if (flags & BT_AllowParallel)
1648  {
1649  while (length >= 4*xmmBlockSize)
1650  {
1651  __m128i block0, block1, block2, block3;
1652  if (flags & BT_InBlockIsCounter)
1653  {
1654  // Increments of 1 and 2 in big-endian, compatible with the ctr byte array.
1655  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1656  const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1657  double temp[2];
1658 
1659  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
1660  // After the dup load we have two counters in the XMM word. Then we need
1661  // to increment the low ctr by 0 and the high ctr by 1.
1662  std::memcpy(temp, inBlocks, blockSize);
1663  block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1664 
1665  // After initial increment of {0,1} remaining counters increment by {2,2}.
1666  block1 = _mm_add_epi32(s_two, block0);
1667  block2 = _mm_add_epi32(s_two, block1);
1668  block3 = _mm_add_epi32(s_two, block2);
1669 
1670  // Store the next counter. When BT_InBlockIsCounter is set then
1671  // inBlocks is backed by m_counterArray which is non-const.
1672  _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block3)));
1673  std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
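      // Editorial walk-through of the counter packing above: if the
      // caller's 8-byte counter is n, the dup load yields {n, n} and the
      // additions produce
      //   block0 = {n+0 (low), n+1 (high)}, block1 = {n+2, n+3},
      //   block2 = {n+4, n+5},              block3 = {n+6, n+7},
      // so four XMM words carry eight 64-bit blocks. The value written
      // back is n+8 (ignoring a wrap of the low counter byte, which the
      // caller handles).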
1674  }
1675  else
1676  {
1677  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1678  inBlocks = PtrAdd(inBlocks, inIncrement);
1679  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1680  inBlocks = PtrAdd(inBlocks, inIncrement);
1681  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1682  inBlocks = PtrAdd(inBlocks, inIncrement);
1683  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1684  inBlocks = PtrAdd(inBlocks, inIncrement);
1685  }
1686 
1687  if (xorInput)
1688  {
1689  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1690  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1691  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1692  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1693  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1694  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1695  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1696  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1697  }
1698 
1699  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1700 
1701  if (xorOutput)
1702  {
1703  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1704  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1705  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1706  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1707  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1708  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1709  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1710  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1711  }
1712 
1713  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1714  outBlocks = PtrAdd(outBlocks, outIncrement);
1715  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1716  outBlocks = PtrAdd(outBlocks, outIncrement);
1717  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1718  outBlocks = PtrAdd(outBlocks, outIncrement);
1719  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1720  outBlocks = PtrAdd(outBlocks, outIncrement);
1721 
1722  length -= 4*xmmBlockSize;
1723  }
1724  }
1725 
1726  if (length)
1727  {
1728  // Adjust to real block size
1729  if (flags & BT_ReverseDirection)
1730  {
1731  inIncrement += inIncrement ? blockSize : 0;
1732  xorIncrement += xorIncrement ? blockSize : 0;
1733  outIncrement += outIncrement ? blockSize : 0;
1734  inBlocks = PtrSub(inBlocks, inIncrement);
1735  xorBlocks = PtrSub(xorBlocks, xorIncrement);
1736  outBlocks = PtrSub(outBlocks, outIncrement);
1737  }
1738  else
1739  {
1740  inIncrement -= inIncrement ? blockSize : 0;
1741  xorIncrement -= xorIncrement ? blockSize : 0;
1742  outIncrement -= outIncrement ? blockSize : 0;
1743  }
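      // Editorial note: the adjustment above shrinks the stride from the
      // 16-byte XMM word to the cipher's 8-byte block. In the forward
      // direction only the increments change (16 -> 8). In reverse, e.g.
      // with 8 bytes left, the setup at the top of the function parked
      // the pointers at base + length - 16 with an increment of -16;
      // bumping each increment by blockSize and stepping the pointers
      // once repositions them at base + length - 8, the last real
      // 8-byte block.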
1744 
1745  while (length >= blockSize)
1746  {
1747  double temp[2];
1748  std::memcpy(temp, inBlocks, blockSize);
1749  __m128i block = _mm_castpd_si128(_mm_load_sd(temp));
1750 
1751  if (xorInput)
1752  {
1753  std::memcpy(temp, xorBlocks, blockSize);
1754  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1755  }
1756 
1757  if (flags & BT_InBlockIsCounter)
1758  const_cast<byte *>(inBlocks)[7]++;
1759 
1760  func1(block, subKeys, static_cast<unsigned int>(rounds));
1761 
1762  if (xorOutput)
1763  {
1764  std::memcpy(temp, xorBlocks, blockSize);
1765  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1766  }
1767 
1768  _mm_store_sd(temp, _mm_castsi128_pd(block));
1769  std::memcpy(outBlocks, temp, blockSize);
1770 
1771  inBlocks = PtrAdd(inBlocks, inIncrement);
1772  outBlocks = PtrAdd(outBlocks, outIncrement);
1773  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1774  length -= blockSize;
1775  }
1776  }
1777 
1778  return length;
1779 }
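
// Editorial note (hypothetical wiring, not part of the library): for this
// 64-bit template a functor receives whole XMM words, so F4 actually sees
// eight cipher blocks per call (two 64-bit blocks per __m128i) and F1 sees
// a single block in the low half of its word. The wiring mirrors the
// 128-bit sketch above:
//
//   inline void MyCipher64_Enc_Block(__m128i &block,
//       const word32 *subkeys, unsigned int rounds) { /* ... */ }
//   inline void MyCipher64_Enc_4_Blocks(__m128i &block0, __m128i &block1,
//       __m128i &block2, __m128i &block3,
//       const word32 *subkeys, unsigned int rounds) { /* ... */ }
//
//   // ... then forward to AdvancedProcessBlocks64_4x1_SSE(
//   //     MyCipher64_Enc_Block, MyCipher64_Enc_4_Blocks, subKeys,
//   //     rounds, inBlocks, xorBlocks, outBlocks, length, flags);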
1780 
1781 NAMESPACE_END // CryptoPP
1782 
1783 #endif // CRYPTOPP_SSSE3_AVAILABLE
1784 
1785 // *********************** Altivec/Power 4 ********************** //
1786 
1787 #if defined(__ALTIVEC__)
1788 
1789 NAMESPACE_BEGIN(CryptoPP)
1790 
1791 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
1792 /// \tparam F2 function to process 2 128-bit blocks
1793 /// \tparam F6 function to process 6 128-bit blocks
1794 /// \tparam W word type of the subkey table
1795 /// \details AdvancedProcessBlocks64_6x2_ALTIVEC processes 6 and 2 Altivec SIMD words
1796 /// at a time. For a single block the template uses F2 with a zero block.
1797 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
1798 /// same word type.
1799 template <typename F2, typename F6, typename W>
1800 inline size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
1801  const W *subKeys, size_t rounds, const byte *inBlocks,
1802  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1803 {
1804  CRYPTOPP_ASSERT(subKeys);
1805  CRYPTOPP_ASSERT(inBlocks);
1806  CRYPTOPP_ASSERT(outBlocks);
1807  CRYPTOPP_ASSERT(length >= 8);
1808 
1809 #if (CRYPTOPP_LITTLE_ENDIAN)
1810  enum {LowOffset=8, HighOffset=0};
1811  const uint32x4_p s_one = {1,0,0,0};
1812  const uint32x4_p s_two = {2,0,2,0};
1813 #else
1814  enum {LowOffset=8, HighOffset=0};
1815  const uint32x4_p s_one = {0,0,0,1};
1816  const uint32x4_p s_two = {0,2,0,2};
1817 #endif
1818 
1819  const size_t blockSize = 8;
1820  const size_t vsxBlockSize = 16;
1821  CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16];
1822 
1823  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : vsxBlockSize;
1824  size_t xorIncrement = (xorBlocks != NULLPTR) ? vsxBlockSize : 0;
1825  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : vsxBlockSize;
1826 
1827  // Clang and Coverity are generating findings using xorBlocks as a flag.
1828  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1829  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1830 
1831  if (flags & BT_ReverseDirection)
1832  {
1833  inBlocks = PtrAdd(inBlocks, length - vsxBlockSize);
1834  xorBlocks = PtrAdd(xorBlocks, length - vsxBlockSize);
1835  outBlocks = PtrAdd(outBlocks, length - vsxBlockSize);
1836  inIncrement = 0-inIncrement;
1837  xorIncrement = 0-xorIncrement;
1838  outIncrement = 0-outIncrement;
1839  }
1840 
1841  if (flags & BT_AllowParallel)
1842  {
1843  while (length >= 6*vsxBlockSize)
1844  {
1845  uint32x4_p block0, block1, block2, block3, block4, block5;
1846  if (flags & BT_InBlockIsCounter)
1847  {
1848  // There is no easy way to load 8 bytes into a vector. It is
1849  // even harder without POWER8 due to lack of 64-bit elements.
1850  std::memcpy(temp+LowOffset, inBlocks, 8);
1851  std::memcpy(temp+HighOffset, inBlocks, 8);
1852  uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
1853 
1854  // For 64-bit block ciphers we need to load the CTR block,
1855  // which is 8 bytes. After the dup load we have two counters
1856  // in the Altivec word. Then we need to increment the low ctr
1857  // by 0 and the high ctr by 1.
1858  block0 = VecAdd(s_one, ctr);
1859 
1860  // After initial increment of {0,1} remaining counters
1861  // increment by {2,2}.
1862  block1 = VecAdd(s_two, block0);
1863  block2 = VecAdd(s_two, block1);
1864  block3 = VecAdd(s_two, block2);
1865  block4 = VecAdd(s_two, block3);
1866  block5 = VecAdd(s_two, block4);
1867 
1868  // Update the counter in the caller.
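      // Six SIMD words hold twelve 64-bit blocks (two per word),
      // hence the increment of 12 below.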
1869  const_cast<byte*>(inBlocks)[7] += 12;
1870  }
1871  else
1872  {
1873  block0 = VecLoadBE(inBlocks);
1874  inBlocks = PtrAdd(inBlocks, inIncrement);
1875  block1 = VecLoadBE(inBlocks);
1876  inBlocks = PtrAdd(inBlocks, inIncrement);
1877  block2 = VecLoadBE(inBlocks);
1878  inBlocks = PtrAdd(inBlocks, inIncrement);
1879  block3 = VecLoadBE(inBlocks);
1880  inBlocks = PtrAdd(inBlocks, inIncrement);
1881  block4 = VecLoadBE(inBlocks);
1882  inBlocks = PtrAdd(inBlocks, inIncrement);
1883  block5 = VecLoadBE(inBlocks);
1884  inBlocks = PtrAdd(inBlocks, inIncrement);
1885  }
1886 
1887  if (xorInput)
1888  {
1889  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1890  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1891  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1892  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1893  block2 = VecXor(block2, VecLoadBE(xorBlocks));
1894  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1895  block3 = VecXor(block3, VecLoadBE(xorBlocks));
1896  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1897  block4 = VecXor(block4, VecLoadBE(xorBlocks));
1898  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1899  block5 = VecXor(block5, VecLoadBE(xorBlocks));
1900  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1901  }
1902 
1903  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1904 
1905  if (xorOutput)
1906  {
1907  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1908  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1909  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1910  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1911  block2 = VecXor(block2, VecLoadBE(xorBlocks));
1912  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1913  block3 = VecXor(block3, VecLoadBE(xorBlocks));
1914  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1915  block4 = VecXor(block4, VecLoadBE(xorBlocks));
1916  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1917  block5 = VecXor(block5, VecLoadBE(xorBlocks));
1918  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1919  }
1920 
1921  VecStoreBE(block0, outBlocks);
1922  outBlocks = PtrAdd(outBlocks, outIncrement);
1923  VecStoreBE(block1, outBlocks);
1924  outBlocks = PtrAdd(outBlocks, outIncrement);
1925  VecStoreBE(block2, outBlocks);
1926  outBlocks = PtrAdd(outBlocks, outIncrement);
1927  VecStoreBE(block3, outBlocks);
1928  outBlocks = PtrAdd(outBlocks, outIncrement);
1929  VecStoreBE(block4, outBlocks);
1930  outBlocks = PtrAdd(outBlocks, outIncrement);
1931  VecStoreBE(block5, outBlocks);
1932  outBlocks = PtrAdd(outBlocks, outIncrement);
1933 
1934  length -= 6*vsxBlockSize;
1935  }
1936 
1937  while (length >= 2*vsxBlockSize)
1938  {
1939  uint32x4_p block0, block1;
1940  if (flags & BT_InBlockIsCounter)
1941  {
1942  // There is no easy way to load 8 bytes into a vector. It is
1943  // even harder without POWER8 due to lack of 64-bit elements.
1944  std::memcpy(temp+LowOffset, inBlocks, 8);
1945  std::memcpy(temp+HighOffset, inBlocks, 8);
1946  uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
1947 
1948  // For 64-bit block ciphers we need to load the CTR block,
1949  // which is 8 bytes. After the dup load we have two counters
1950  // in the Altivec word. Then we need to increment the low ctr
1951  // by 0 and the high ctr by 1.
1952  block0 = VecAdd(s_one, ctr);
1953 
1954  // After initial increment of {0,1} remaining counters
1955  // increment by {2,2}.
1956  block1 = VecAdd(s_two, block0);
1957 
1958  // Update the counter in the caller.
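      // Two SIMD words hold four 64-bit blocks, hence the increment of 4.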
1959  const_cast<byte*>(inBlocks)[7] += 4;
1960  }
1961  else
1962  {
1963  block0 = VecLoadBE(inBlocks);
1964  inBlocks = PtrAdd(inBlocks, inIncrement);
1965  block1 = VecLoadBE(inBlocks);
1966  inBlocks = PtrAdd(inBlocks, inIncrement);
1967  }
1968 
1969  if (xorInput)
1970  {
1971  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1972  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1973  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1974  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1975  }
1976 
1977  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1978 
1979  if (xorOutput)
1980  {
1981  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1982  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1983  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1984  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1985  }
1986 
1987  VecStoreBE(block0, outBlocks);
1988  outBlocks = PtrAdd(outBlocks, outIncrement);
1989  VecStoreBE(block1, outBlocks);
1990  outBlocks = PtrAdd(outBlocks, outIncrement);
1991 
1992  length -= 2*vsxBlockSize;
1993  }
1994  }
1995 
1996  if (length)
1997  {
1998  // Adjust to real block size
1999  if (flags & BT_ReverseDirection)
2000  {
2001  inIncrement += inIncrement ? blockSize : 0;
2002  xorIncrement += xorIncrement ? blockSize : 0;
2003  outIncrement += outIncrement ? blockSize : 0;
2004  inBlocks = PtrSub(inBlocks, inIncrement);
2005  xorBlocks = PtrSub(xorBlocks, xorIncrement);
2006  outBlocks = PtrSub(outBlocks, outIncrement);
2007  }
2008  else
2009  {
2010  inIncrement -= inIncrement ? blockSize : 0;
2011  xorIncrement -= xorIncrement ? blockSize : 0;
2012  outIncrement -= outIncrement ? blockSize : 0;
2013  }
2014 
2015  while (length >= blockSize)
2016  {
2017  uint32x4_p block, zero = {0};
2018 
2019  // There is no easy way to load 8 bytes into a vector. It is
2020  // even harder without POWER8 due to lack of 64-bit elements.
2021  // The high 8 bytes are "don't care", but if we don't
2022  // initialize the block then the compiler generates warnings.
2023  std::memcpy(temp+LowOffset, inBlocks, 8);
2024  std::memcpy(temp+HighOffset, inBlocks, 8); // don't care
2025  block = (uint32x4_p)VecLoadBE(temp);
2026 
2027  if (xorInput)
2028  {
2029  std::memcpy(temp+LowOffset, xorBlocks, 8);
2030  std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
2031  uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
2032  block = VecXor(block, x);
2033  }
2034 
2035  // Update the counter in the caller.
2036  if (flags & BT_InBlockIsCounter)
2037  const_cast<byte *>(inBlocks)[7]++;
2038 
2039  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
2040 
2041  if (xorOutput)
2042  {
2043  std::memcpy(temp+LowOffset, xorBlocks, 8);
2044  std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
2045  uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
2046  block = VecXor(block, x);
2047  }
2048 
2049  VecStoreBE(block, temp);
2050  std::memcpy(outBlocks, temp+LowOffset, 8);
2051 
2052  inBlocks = PtrAdd(inBlocks, inIncrement);
2053  outBlocks = PtrAdd(outBlocks, outIncrement);
2054  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2055  length -= blockSize;
2056  }
2057  }
2058 
2059  return length;
2060 }
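
// Editorial usage sketch (hypothetical names, not part of the library):
// the Altivec templates take F2/F6 functors over uint32x4_p words, with
// rounds delivered as unsigned int at the call sites above. For a lone
// trailing block this template calls F2 with a zero word, so F2 must
// tolerate a dummy second block.
//
//   inline void MyCipher64_Enc_2_Blocks(uint32x4_p &block0,
//       uint32x4_p &block1, const word32 *subkeys, unsigned int rounds)
//   { /* ... */ }
//
//   inline void MyCipher64_Enc_6_Blocks(uint32x4_p &block0,
//       uint32x4_p &block1, uint32x4_p &block2, uint32x4_p &block3,
//       uint32x4_p &block4, uint32x4_p &block5,
//       const word32 *subkeys, unsigned int rounds)
//   { /* ... */ }
//
//   // ... then forward to AdvancedProcessBlocks64_6x2_ALTIVEC(
//   //     MyCipher64_Enc_2_Blocks, MyCipher64_Enc_6_Blocks, subKeys,
//   //     rounds, inBlocks, xorBlocks, outBlocks, length, flags);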
2061 
2062 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
2063 /// \tparam F1 function to process 1 128-bit block
2064 /// \tparam F4 function to process 4 128-bit blocks
2065 /// \tparam W word type of the subkey table
2066 /// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD words
2067 /// at a time.
2068 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
2069 /// same word type.
2070 template <typename F1, typename F4, typename W>
2071 inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
2072  const W *subKeys, size_t rounds, const byte *inBlocks,
2073  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
2074 {
2075  CRYPTOPP_ASSERT(subKeys);
2076  CRYPTOPP_ASSERT(inBlocks);
2077  CRYPTOPP_ASSERT(outBlocks);
2078  CRYPTOPP_ASSERT(length >= 16);
2079 
2080 #if (CRYPTOPP_LITTLE_ENDIAN)
2081  const uint32x4_p s_one = {1,0,0,0};
2082 #else
2083  const uint32x4_p s_one = {0,0,0,1};
2084 #endif
2085 
2086  const size_t blockSize = 16;
2087  // const size_t vsxBlockSize = 16;
2088 
2089  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2090  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2091  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
2092 
2093  // Clang and Coverity are generating findings using xorBlocks as a flag.
2094  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2095  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
2096 
2097  if (flags & BT_ReverseDirection)
2098  {
2099  inBlocks = PtrAdd(inBlocks, length - blockSize);
2100  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
2101  outBlocks = PtrAdd(outBlocks, length - blockSize);
2102  inIncrement = 0-inIncrement;
2103  xorIncrement = 0-xorIncrement;
2104  outIncrement = 0-outIncrement;
2105  }
2106 
2107  if (flags & BT_AllowParallel)
2108  {
2109  while (length >= 4*blockSize)
2110  {
2111  uint32x4_p block0, block1, block2, block3;
2112 
2113  if (flags & BT_InBlockIsCounter)
2114  {
2115  block0 = VecLoadBE(inBlocks);
2116  block1 = VecAdd(block0, s_one);
2117  block2 = VecAdd(block1, s_one);
2118  block3 = VecAdd(block2, s_one);
2119 
2120  // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
2121  // CTR_ModePolicy::OperateKeystream is wired such that after
2122  // returning from this function CTR_ModePolicy will detect wrap
2123  // on the last counter byte and increment the next to last byte.
2124  // The problem is, with a big-endian load, inBlocks[15] is really
2125  // located at index 15. The vector addition using a 32-bit element
2126  // generates a carry into inBlocks[14] and then CTR_ModePolicy
2127  // increments inBlocks[14] too.
2128  const_cast<byte*>(inBlocks)[15] += 4;
2129  }
2130  else
2131  {
2132  block0 = VecLoadBE(inBlocks);
2133  inBlocks = PtrAdd(inBlocks, inIncrement);
2134  block1 = VecLoadBE(inBlocks);
2135  inBlocks = PtrAdd(inBlocks, inIncrement);
2136  block2 = VecLoadBE(inBlocks);
2137  inBlocks = PtrAdd(inBlocks, inIncrement);
2138  block3 = VecLoadBE(inBlocks);
2139  inBlocks = PtrAdd(inBlocks, inIncrement);
2140  }
2141 
2142  if (xorInput)
2143  {
2144  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2145  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2146  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2147  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2148  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2149  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2150  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2151  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2152  }
2153 
2154  func4(block0, block1, block2, block3, subKeys, rounds);
2155 
2156  if (xorOutput)
2157  {
2158  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2159  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2160  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2161  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2162  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2163  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2164  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2165  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2166  }
2167 
2168  VecStoreBE(block0, outBlocks);
2169  outBlocks = PtrAdd(outBlocks, outIncrement);
2170  VecStoreBE(block1, outBlocks);
2171  outBlocks = PtrAdd(outBlocks, outIncrement);
2172  VecStoreBE(block2, outBlocks);
2173  outBlocks = PtrAdd(outBlocks, outIncrement);
2174  VecStoreBE(block3, outBlocks);
2175  outBlocks = PtrAdd(outBlocks, outIncrement);
2176 
2177  length -= 4*blockSize;
2178  }
2179  }
2180 
2181  while (length >= blockSize)
2182  {
2183  uint32x4_p block = VecLoadBE(inBlocks);
2184 
2185  if (xorInput)
2186  block = VecXor(block, VecLoadBE(xorBlocks));
2187 
2188  if (flags & BT_InBlockIsCounter)
2189  const_cast<byte *>(inBlocks)[15]++;
2190 
2191  func1(block, subKeys, rounds);
2192 
2193  if (xorOutput)
2194  block = VecXor(block, VecLoadBE(xorBlocks));
2195 
2196  VecStoreBE(block, outBlocks);
2197 
2198  inBlocks = PtrAdd(inBlocks, inIncrement);
2199  outBlocks = PtrAdd(outBlocks, outIncrement);
2200  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2201  length -= blockSize;
2202  }
2203 
2204  return length;
2205 }
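
// Editorial note (an observation, not part of the library): wiring for
// this template mirrors the SSE sketch earlier, with uint32x4_p in place
// of __m128i. One hedged caution: unlike the SSE templates, this one
// passes rounds to F1 and F4 as size_t with no cast, so functors that
// take unsigned int rely on an implicit conversion at the call site.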
2206 
2207 /// \brief AdvancedProcessBlocks for 1 and 6 blocks
2208 /// \tparam F1 function to process 1 128-bit block
2209 /// \tparam F6 function to process 6 128-bit blocks
2210 /// \tparam W word type of the subkey table
2211 /// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
2212 /// at a time.
2213 /// \details The subkey type is usually word32 or word64. F1 and F6 must use the
2214 /// same word type.
2215 template <typename F1, typename F6, typename W>
2216 inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
2217  const W *subKeys, size_t rounds, const byte *inBlocks,
2218  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
2219 {
2220  CRYPTOPP_ASSERT(subKeys);
2221  CRYPTOPP_ASSERT(inBlocks);
2222  CRYPTOPP_ASSERT(outBlocks);
2223  CRYPTOPP_ASSERT(length >= 16);
2224 
2225 #if (CRYPTOPP_LITTLE_ENDIAN)
2226  const uint32x4_p s_one = {1,0,0,0};
2227 #else
2228  const uint32x4_p s_one = {0,0,0,1};
2229 #endif
2230 
2231  const size_t blockSize = 16;
2232  // const size_t vsxBlockSize = 16;
2233 
2234  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2235  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2236  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
2237 
2238  // Clang and Coverity are generating findings using xorBlocks as a flag.
2239  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2240  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
2241 
2242  if (flags & BT_ReverseDirection)
2243  {
2244  inBlocks = PtrAdd(inBlocks, length - blockSize);
2245  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
2246  outBlocks = PtrAdd(outBlocks, length - blockSize);
2247  inIncrement = 0-inIncrement;
2248  xorIncrement = 0-xorIncrement;
2249  outIncrement = 0-outIncrement;
2250  }
2251 
2252  if (flags & BT_AllowParallel)
2253  {
2254  while (length >= 6*blockSize)
2255  {
2256  uint32x4_p block0, block1, block2, block3, block4, block5;
2257 
2258  if (flags & BT_InBlockIsCounter)
2259  {
2260  block0 = VecLoadBE(inBlocks);
2261  block1 = VecAdd(block0, s_one);
2262  block2 = VecAdd(block1, s_one);
2263  block3 = VecAdd(block2, s_one);
2264  block4 = VecAdd(block3, s_one);
2265  block5 = VecAdd(block4, s_one);
2266 
2267  // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
2268  // CTR_ModePolicy::OperateKeystream is wired such that after
2269  // returning from this function CTR_ModePolicy will detect wrap
2270  // on the last counter byte and increment the next to last byte.
2271  // The problem is, with a big-endian load, inBlocks[15] is really
2272  // located at index 15. The vector addition using a 32-bit element
2273  // generates a carry into inBlocks[14] and then CTR_ModePolicy
2274  // increments inBlocks[14] too.
2275  //
2276  // To find this bug we needed a test case with a ctr of 0xNN...FA.
2277  // The last octet is 0xFA and adding 6 creates the wrap to trigger
2278  // the issue. If the last octet was 0xFC then 4 would trigger it.
2279  // We dumb-lucked into the test with SPECK-128. The test case of
2280  // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
2281  uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
2282  VecStoreBE(temp, const_cast<byte*>(inBlocks));
2283  }
2284  else
2285  {
2286  block0 = VecLoadBE(inBlocks);
2287  inBlocks = PtrAdd(inBlocks, inIncrement);
2288  block1 = VecLoadBE(inBlocks);
2289  inBlocks = PtrAdd(inBlocks, inIncrement);
2290  block2 = VecLoadBE(inBlocks);
2291  inBlocks = PtrAdd(inBlocks, inIncrement);
2292  block3 = VecLoadBE(inBlocks);
2293  inBlocks = PtrAdd(inBlocks, inIncrement);
2294  block4 = VecLoadBE(inBlocks);
2295  inBlocks = PtrAdd(inBlocks, inIncrement);
2296  block5 = VecLoadBE(inBlocks);
2297  inBlocks = PtrAdd(inBlocks, inIncrement);
2298  }
2299 
2300  if (xorInput)
2301  {
2302  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2303  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2304  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2305  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2306  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2307  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2308  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2309  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2310  block4 = VecXor(block4, VecLoadBE(xorBlocks));
2311  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2312  block5 = VecXor(block5, VecLoadBE(xorBlocks));
2313  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2314  }
2315 
2316  func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
2317 
2318  if (xorOutput)
2319  {
2320  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2321  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2322  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2323  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2324  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2325  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2326  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2327  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2328  block4 = VecXor(block4, VecLoadBE(xorBlocks));
2329  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2330  block5 = VecXor(block5, VecLoadBE(xorBlocks));
2331  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2332  }
2333 
2334  VecStoreBE(block0, outBlocks);
2335  outBlocks = PtrAdd(outBlocks, outIncrement);
2336  VecStoreBE(block1, outBlocks);
2337  outBlocks = PtrAdd(outBlocks, outIncrement);
2338  VecStoreBE(block2, outBlocks);
2339  outBlocks = PtrAdd(outBlocks, outIncrement);
2340  VecStoreBE(block3, outBlocks);
2341  outBlocks = PtrAdd(outBlocks, outIncrement);
2342  VecStoreBE(block4, outBlocks);
2343  outBlocks = PtrAdd(outBlocks, outIncrement);
2344  VecStoreBE(block5, outBlocks);
2345  outBlocks = PtrAdd(outBlocks, outIncrement);
2346 
2347  length -= 6*blockSize;
2348  }
2349  }
2350 
2351  while (length >= blockSize)
2352  {
2353  uint32x4_p block = VecLoadBE(inBlocks);
2354 
2355  if (xorInput)
2356  block = VecXor(block, VecLoadBE(xorBlocks));
2357 
2358  if (flags & BT_InBlockIsCounter)
2359  const_cast<byte *>(inBlocks)[15]++;
2360 
2361  func1(block, subKeys, rounds);
2362 
2363  if (xorOutput)
2364  block = VecXor(block, VecLoadBE(xorBlocks));
2365 
2366  VecStoreBE(block, outBlocks);
2367 
2368  inBlocks = PtrAdd(inBlocks, inIncrement);
2369  outBlocks = PtrAdd(outBlocks, outIncrement);
2370  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2371  length -= blockSize;
2372  }
2373 
2374  return length;
2375 }
2376 
2377 NAMESPACE_END // CryptoPP
2378 
2379 #endif // __ALTIVEC__
2380 
2381 #endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES