Crypto++ 8.0
Free C++ class library of cryptographic schemes
cham_simd.cpp
1 // cham_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3 instructions. A separate source file is needed because
5 // additional CXXFLAGS are required to enable the appropriate
6 // instruction sets in some build configurations.
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "cham.h"
12 #include "misc.h"
13 #include "adv_simd.h"
14 
15 // Uncomment for benchmarking C++ against SSE or NEON.
16 // Do so in both cham.cpp and cham_simd.cpp.
17 // #undef CRYPTOPP_SSSE3_AVAILABLE
18 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
19 
20 #if defined(CRYPTOPP_SSE2_AVAILABLE)
21 # include <xmmintrin.h>
22 # include <emmintrin.h>
23 #endif
25 
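// The CRYPTOPP_AVX512_ROTATE path in RotateLeft32/RotateRight32 below relies on
// _mm_rol_epi32 and _mm_ror_epi32, which are AVX-512F/AVX-512VL intrinsics
// declared in <immintrin.h>. A guard along these lines (a sketch using the
// usual GCC/Clang/MSVC predefined macros) keeps the fast rotate available
// without breaking SSE2/SSSE3-only builds.
#if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif
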
26 #if (CRYPTOPP_SSSE3_AVAILABLE)
27 # include <pmmintrin.h>
28 # include <tmmintrin.h>
29 #endif
30 
31 #if defined(__XOP__)
32 # include <ammintrin.h>
33 #endif
34 
35 // Squash MS LNK4221 and libtool warnings
36 extern const char CHAM_SIMD_FNAME[] = __FILE__;
37 
38 ANONYMOUS_NAMESPACE_BEGIN
39 
40 using CryptoPP::word16;
41 using CryptoPP::word32;
42 
43 #if (CRYPTOPP_SSSE3_AVAILABLE)
44 
45 //////////////////////////////////////////////////////////////////////////
46 
47 NAMESPACE_BEGIN(W16) // CHAM64, 16-bit word size
48 
49 template <unsigned int R>
50 inline __m128i RotateLeft16(const __m128i& val)
51 {
52 #if defined(__XOP__)
53  return _mm_roti_epi16(val, R);
54 #else
55  return _mm_or_si128(
56  _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
57 #endif
58 }
59 
60 template <unsigned int R>
61 inline __m128i RotateRight16(const __m128i& val)
62 {
63 #if defined(__XOP__)
64  return _mm_roti_epi16(val, 16-R);
65 #else
66  return _mm_or_si128(
67  _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
68 #endif
69 }
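// For reference, each 16-bit lane above computes the usual rotate identity
// (for 0 < R < 16):
//   word16 rotl16(word16 x, unsigned int r) { return word16((x << r) | (x >> (16 - r))); }
// The XOP builtin _mm_roti_epi16 performs the same rotation in one instruction.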
70 
71 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
72 template <>
73 inline __m128i RotateLeft16<8>(const __m128i& val)
74 {
75 #if defined(__XOP__)
76  return _mm_roti_epi16(val, 8);
77 #else
78  const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
79  return _mm_shuffle_epi8(val, mask);
80 #endif
81 }
82 
83 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
84 template <>
85 inline __m128i RotateRight16<8>(const __m128i& val)
86 {
87 #if defined(__XOP__)
88  return _mm_roti_epi16(val, 16-8);
89 #else
90  const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
91  return _mm_shuffle_epi8(val, mask);
92 #endif
93 }
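// Rotating a 16-bit lane by 8 in either direction swaps its two bytes, which
// is why RotateLeft16<8> and RotateRight16<8> use the identical shuffle mask:
// one pshufb replaces the two shifts and the OR.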
94 
95 template <unsigned int IDX>
96 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
97  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
98 {
99  // Should not be instantiated
100  CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
101  CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
102  CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f);
103  CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h);
104  CRYPTOPP_ASSERT(0);
105  return _mm_setzero_si128();
106 }
107 
108 template <>
109 inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
110  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
111 {
112  // The shuffle converts to and from little-endian for SSE. A specialized
113  // CHAM implementation can avoid the shuffle by framing the data for
114  // encryption, decryption and benchmarks. The library cannot take the
115  // speed-up because of the byte oriented API.
116  const __m128i r1 = _mm_unpacklo_epi16(a, b);
117  const __m128i r2 = _mm_unpacklo_epi16(c, d);
118  const __m128i r3 = _mm_unpacklo_epi16(e, f);
119  const __m128i r4 = _mm_unpacklo_epi16(g, h);
120 
121  const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
122  const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
123  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
124  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
125 }
126 
127 template <>
128 inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
129  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
130 {
131  // The shuffle converts to and from little-endian for SSE. A specialized
132  // CHAM implementation can avoid the shuffle by framing the data for
133  // encryption, decryption and benchmarks. The library cannot take the
134  // speed-up because of the byte oriented API.
135  const __m128i r1 = _mm_unpacklo_epi16(a, b);
136  const __m128i r2 = _mm_unpacklo_epi16(c, d);
137  const __m128i r3 = _mm_unpacklo_epi16(e, f);
138  const __m128i r4 = _mm_unpacklo_epi16(g, h);
139 
140  const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
141  const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
142  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
143  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
144 }
145 
146 template <>
147 inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
148  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
149 {
150  // The shuffle converts to and from little-endian for SSE. A specialized
151  // CHAM implementation can avoid the shuffle by framing the data for
152  // encryption, decryption and benchmarks. The library cannot take the
153  // speed-up because of the byte oriented API.
154  const __m128i r1 = _mm_unpacklo_epi16(a, b);
155  const __m128i r2 = _mm_unpacklo_epi16(c, d);
156  const __m128i r3 = _mm_unpacklo_epi16(e, f);
157  const __m128i r4 = _mm_unpacklo_epi16(g, h);
158 
159  const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
160  const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
161  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
162  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
163 }
164 
165 template <>
166 inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
167  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
168 {
169  // The shuffle converts to and from little-endian for SSE. A specialized
170  // CHAM implementation can avoid the shuffle by framing the data for
171  // encryption, decryption and benchmarks. The library cannot take the
172  // speed-up because of the byte oriented API.
173  const __m128i r1 = _mm_unpacklo_epi16(a, b);
174  const __m128i r2 = _mm_unpacklo_epi16(c, d);
175  const __m128i r3 = _mm_unpacklo_epi16(e, f);
176  const __m128i r4 = _mm_unpacklo_epi16(g, h);
177 
178  const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
179  const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
180  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
181  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
182 }
183 
184 template <>
185 inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
186  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
187 {
188  // The shuffle converts to and from little-endian for SSE. A specialized
189  // CHAM implementation can avoid the shuffle by framing the data for
190  // encryption, decryption and benchmarks. The library cannot take the
191  // speed-up because of the byte oriented API.
192  const __m128i r1 = _mm_unpackhi_epi16(a, b);
193  const __m128i r2 = _mm_unpackhi_epi16(c, d);
194  const __m128i r3 = _mm_unpackhi_epi16(e, f);
195  const __m128i r4 = _mm_unpackhi_epi16(g, h);
196 
197  const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
198  const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
199  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
200  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
201 }
202 
203 template <>
204 inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
205  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
206 {
207  // The shuffle converts to and from little-endian for SSE. A specialized
208  // CHAM implementation can avoid the shuffle by framing the data for
209  // encryption, decryption and benchmarks. The library cannot take the
210  // speed-up because of the byte oriented API.
211  const __m128i r1 = _mm_unpackhi_epi16(a, b);
212  const __m128i r2 = _mm_unpackhi_epi16(c, d);
213  const __m128i r3 = _mm_unpackhi_epi16(e, f);
214  const __m128i r4 = _mm_unpackhi_epi16(g, h);
215 
216  const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
217  const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
218  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
219  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
220 }
221 
222 template <>
223 inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
224  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
225 {
226  // The shuffle converts to and from little-endian for SSE. A specialized
227  // CHAM implementation can avoid the shuffle by framing the data for
228  // encryption, decryption and benchmarks. The library cannot take the
229  // speed-up because of the byte oriented API.
230  const __m128i r1 = _mm_unpackhi_epi16(a, b);
231  const __m128i r2 = _mm_unpackhi_epi16(c, d);
232  const __m128i r3 = _mm_unpackhi_epi16(e, f);
233  const __m128i r4 = _mm_unpackhi_epi16(g, h);
234 
235  const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
236  const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
237  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
238  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
239 }
240 
241 template <>
242 inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
243  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
244 {
245  // The shuffle converts to and from little-endian for SSE. A specialized
246  // CHAM implementation can avoid the shuffle by framing the data for
247  // encryption, decryption and benchmarks. The library cannot take the
248  // speed-up because of the byte oriented API.
249  const __m128i r1 = _mm_unpackhi_epi16(a, b);
250  const __m128i r2 = _mm_unpackhi_epi16(c, d);
251  const __m128i r3 = _mm_unpackhi_epi16(e, f);
252  const __m128i r4 = _mm_unpackhi_epi16(g, h);
253 
254  const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
255  const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
256  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
257  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
258 }
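// Taken together, the eight specializations above perform an 8x8 transpose of
// the 16-bit words in a..h: UnpackXMM<IDX>(a, ..., h) returns word IDX of each
// input, i.e. [a_IDX b_IDX c_IDX d_IDX e_IDX f_IDX g_IDX h_IDX], and the final
// _mm_shuffle_epi8 swaps the bytes of every lane for the endian conversion
// noted in the comments.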
259 
260 template <unsigned int IDX>
261 inline __m128i UnpackXMM(const __m128i& v)
262 {
263  // Should not be instantiated
264  CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
265 
266  return _mm_setzero_si128();
267 }
268 
269 template <>
270 inline __m128i UnpackXMM<0>(const __m128i& v)
271 {
272  return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
273 }
274 
275 template <>
276 inline __m128i UnpackXMM<1>(const __m128i& v)
277 {
278  return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
279 }
280 
281 template <>
282 inline __m128i UnpackXMM<2>(const __m128i& v)
283 {
284  return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
285 }
286 
287 template <>
288 inline __m128i UnpackXMM<3>(const __m128i& v)
289 {
290  return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
291 }
292 
293 template <>
294 inline __m128i UnpackXMM<4>(const __m128i& v)
295 {
296  return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
297 }
298 
299 template <>
300 inline __m128i UnpackXMM<5>(const __m128i& v)
301 {
302  return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
303 }
304 
305 template <>
306 inline __m128i UnpackXMM<6>(const __m128i& v)
307 {
308  return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
309 }
310 
311 template <>
312 inline __m128i UnpackXMM<7>(const __m128i& v)
313 {
314  return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
315 }
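// The single-argument overloads above serve the one-block code path: each
// broadcasts word IDX of v to all eight 16-bit lanes, and the byte pairs in
// the masks select the two bytes of that word in swapped order, performing
// the same per-lane endian conversion as the eight-argument overloads.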
316 
317 template <unsigned int IDX>
318 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
319 {
320  const __m128i& z = _mm_setzero_si128();
321  return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
322 }
323 
324 template <unsigned int IDX>
325 inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
326  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
327 {
328  return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
329 }
330 
331 template <unsigned int IDX>
332 inline __m128i RepackXMM(const __m128i& v)
333 {
334  return UnpackXMM<IDX>(v);
335 }
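// RepackXMM simply reapplies the unpack network: the word-level transpose and
// the per-lane byte swap are both involutions and they commute, so the same
// shuffles that split blocks into word slices also gather the slices back
// into blocks.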
336 
337 inline void CHAM64_Enc_Block(__m128i &block0,
338  const word16 *subkeys, unsigned int /*rounds*/)
339 {
340  // Rearrange the data for vectorization. UnpackXMM includes a
341  // little-endian swap for SSE. Thanks to Peter Cordes for help
342  // with packing and unpacking.
343  // [A1 A2 .. A7 A8][B1 B2 .. B7 B8] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
344  __m128i a = UnpackXMM<0>(block0);
345  __m128i b = UnpackXMM<1>(block0);
346  __m128i c = UnpackXMM<2>(block0);
347  __m128i d = UnpackXMM<3>(block0);
348  __m128i e = UnpackXMM<4>(block0);
349  __m128i f = UnpackXMM<5>(block0);
350  __m128i g = UnpackXMM<6>(block0);
351  __m128i h = UnpackXMM<7>(block0);
352 
353  const unsigned int rounds = 80;
354  __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
355  __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
356 
357  const unsigned int MASK = 15;
358  for (int i=0; i<static_cast<int>(rounds); i+=4)
359  {
360  __m128i k, kr, t1, t2, t3, t4;
361  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));
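// _mm_load_sd reads 64 bits starting at subkeys[(i+0) & MASK], i.e. four
// consecutive word16 round keys, and _mm_castpd_si128 merely reinterprets
// the register. The _mm_shuffle_epi8 calls below then broadcast one of those
// four keys to every 16-bit lane for the current round.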
362 
363  // Shuffle out key
364  kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
365 
366  t1 = _mm_xor_si128(a, counter);
367  t3 = _mm_xor_si128(e, counter);
368  t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
369  t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
370  a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
371  e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
372 
373  counter = _mm_add_epi16(counter, increment);
374  kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
375 
376  t1 = _mm_xor_si128(b, counter);
377  t3 = _mm_xor_si128(f, counter);
378  t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
379  t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
380  b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
381  f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
382 
383  counter = _mm_add_epi16(counter, increment);
384  kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
385 
386  t1 = _mm_xor_si128(c, counter);
387  t3 = _mm_xor_si128(g, counter);
388  t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
389  t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
390  c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
391  g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
392 
393  counter = _mm_add_epi16(counter, increment);
394  kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
395 
396  t1 = _mm_xor_si128(d, counter);
397  t3 = _mm_xor_si128(h, counter);
398  t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
399  t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
400  d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
401  h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
402 
403  counter = _mm_add_epi16(counter, increment);
404  }
405 
406  // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A7 A8][B1 B2 .. B7 B8] ...
407  block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
408 }
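// The loop above is the CHAM-64/128 round function applied to eight 16-bit
// lanes at once; the 'counter' register carries the round index i that CHAM
// XORs into the state. A scalar sketch of the same rounds on one block, for
// clarity (x0..x3 hold the block's four word16 words, rk[] its 16 round keys,
// rotlFixed is the rotate helper from misc.h):
//
//   for (unsigned int i = 0; i < 80; ++i) {
//       word16 t;
//       if (i % 2 == 0)
//           t = rotlFixed<word16>(word16((x0 ^ i) + (rotlFixed<word16>(x1, 1) ^ rk[i % 16])), 8);
//       else
//           t = rotlFixed<word16>(word16((x0 ^ i) + (rotlFixed<word16>(x1, 8) ^ rk[i % 16])), 1);
//       x0 = x1; x1 = x2; x2 = x3; x3 = t;
//   }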
409 
410 inline void CHAM64_Dec_Block(__m128i &block0,
411  const word16 *subkeys, unsigned int /*rounds*/)
412 {
413  // Rearrange the data for vectorization. UnpackXMM includes a
414  // little-endian swap for SSE. Thanks to Peter Cordes for help
415  // with packing and unpacking.
416  // [A1 A2 .. A7 A8][B1 B2 .. B7 B8] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
417  __m128i a = UnpackXMM<0>(block0);
418  __m128i b = UnpackXMM<1>(block0);
419  __m128i c = UnpackXMM<2>(block0);
420  __m128i d = UnpackXMM<3>(block0);
421  __m128i e = UnpackXMM<4>(block0);
422  __m128i f = UnpackXMM<5>(block0);
423  __m128i g = UnpackXMM<6>(block0);
424  __m128i h = UnpackXMM<7>(block0);
425 
426  const unsigned int rounds = 80;
427  __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
428  __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
429 
430  const unsigned int MASK = 15;
431  for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
432  {
433  __m128i k, kr, t1, t2, t3, t4;
434  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
435 
436  // Shuffle out key
437  kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
438 
439  // Odd round
440  t1 = RotateRight16<1>(d);
441  t3 = RotateRight16<1>(h);
442  t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
443  t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
444  d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
445  h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
446 
447  counter = _mm_sub_epi16(counter, decrement);
448  kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
449 
450  // Even round
451  t1 = RotateRight16<8>(c);
452  t3 = RotateRight16<8>(g);
453  t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
454  t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
455  c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
456  g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
457 
458  counter = _mm_sub_epi16(counter, decrement);
459  kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
460 
461  // Odd round
462  t1 = RotateRight16<1>(b);
463  t3 = RotateRight16<1>(f);
464  t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
465  t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
466  b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
467  f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
468 
469  counter = _mm_sub_epi16(counter, decrement);
470  kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
471 
472  // Even round
473  t1 = RotateRight16<8>(a);
474  t3 = RotateRight16<8>(e);
475  t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
476  t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
477  a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
478  e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
479 
480  counter = _mm_sub_epi16(counter, decrement);
481  }
482 
483  // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A7 A8][B1 B2 .. B7 B8] ...
484  block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
485 }
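// Decryption runs the same schedule in reverse: the counter starts at
// rounds-1 and decrements, the modular additions become subtractions, and the
// outer rotations are undone with RotateRight16 before the subtraction, so
// each step is the exact inverse of the corresponding encryption step.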
486 
487 inline void CHAM64_Enc_2_Blocks(__m128i &block0,
488  __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
489 {
490  // Rearrange the data for vectorization. UnpackXMM includes a
491  // little-endian swap for SSE. Thanks to Peter Cordes for help
492  // with packing and unpacking.
493  // [A1 A2 .. A7 A8][B1 B2 .. B7 B8] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
494  __m128i a = UnpackXMM<0>(block0, block1);
495  __m128i b = UnpackXMM<1>(block0, block1);
496  __m128i c = UnpackXMM<2>(block0, block1);
497  __m128i d = UnpackXMM<3>(block0, block1);
498  __m128i e = UnpackXMM<4>(block0, block1);
499  __m128i f = UnpackXMM<5>(block0, block1);
500  __m128i g = UnpackXMM<6>(block0, block1);
501  __m128i h = UnpackXMM<7>(block0, block1);
502 
503  const unsigned int rounds = 80;
504  __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
505  __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
506 
507  const unsigned int MASK = 15;
508  for (int i=0; i<static_cast<int>(rounds); i+=4)
509  {
510  __m128i k, kr, t1, t2, t3, t4;
511  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[i & MASK])));
512 
513  // Shuffle out key
514  kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
515 
516  t1 = _mm_xor_si128(a, counter);
517  t3 = _mm_xor_si128(e, counter);
518  t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
519  t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
520  a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
521  e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
522 
523  counter = _mm_add_epi16(counter, increment);
524  kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
525 
526  t1 = _mm_xor_si128(b, counter);
527  t3 = _mm_xor_si128(f, counter);
528  t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
529  t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
530  b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
531  f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
532 
533  counter = _mm_add_epi16(counter, increment);
534  kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
535 
536  t1 = _mm_xor_si128(c, counter);
537  t3 = _mm_xor_si128(g, counter);
538  t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
539  t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
540  c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
541  g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
542 
543  counter = _mm_add_epi16(counter, increment);
544  kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
545 
546  t1 = _mm_xor_si128(d, counter);
547  t3 = _mm_xor_si128(h, counter);
548  t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
549  t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
550  d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
551  h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
552 
553  counter = _mm_add_epi16(counter, increment);
554  }
555 
556  // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A7 A8][B1 B2 .. B7 B8] ...
557  block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
558  block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
559 }
560 
561 inline void CHAM64_Dec_2_Blocks(__m128i &block0,
562  __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
563 {
564  // Rearrange the data for vectorization. UnpackXMM includes a
565  // little-endian swap for SSE. Thanks to Peter Cordes for help
566  // with packing and unpacking.
567  // [A1 A2 .. A7 A8][B1 B2 .. B7 B8] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
568  __m128i a = UnpackXMM<0>(block0, block1);
569  __m128i b = UnpackXMM<1>(block0, block1);
570  __m128i c = UnpackXMM<2>(block0, block1);
571  __m128i d = UnpackXMM<3>(block0, block1);
572  __m128i e = UnpackXMM<4>(block0, block1);
573  __m128i f = UnpackXMM<5>(block0, block1);
574  __m128i g = UnpackXMM<6>(block0, block1);
575  __m128i h = UnpackXMM<7>(block0, block1);
576 
577  const unsigned int rounds = 80;
578  __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
579  __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
580 
581  const unsigned int MASK = 15;
582  for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
583  {
584  __m128i k, kr, t1, t2, t3, t4;
585  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
586 
587  // Shuffle out key
588  kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
589 
590  // Odd round
591  t1 = RotateRight16<1>(d);
592  t3 = RotateRight16<1>(h);
593  t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
594  t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
595  d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
596  h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
597 
598  counter = _mm_sub_epi16(counter, decrement);
599  kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
600 
601  // Even round
602  t1 = RotateRight16<8>(c);
603  t3 = RotateRight16<8>(g);
604  t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
605  t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
606  c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
607  g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
608 
609  counter = _mm_sub_epi16(counter, decrement);
610  kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
611 
612  // Odd round
613  t1 = RotateRight16<1>(b);
614  t3 = RotateRight16<1>(f);
615  t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
616  t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
617  b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
618  f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
619 
620  counter = _mm_sub_epi16(counter, decrement);
621  kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
622 
623  // Even round
624  t1 = RotateRight16<8>(a);
625  t3 = RotateRight16<8>(e);
626  t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
627  t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
628  a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
629  e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
630 
631  counter = _mm_sub_epi16(counter, decrement);
632  }
633 
634  // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A7 A8][B1 B2 .. B7 B8] ...
635  block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
636  block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
637 }
638 
639 NAMESPACE_END // W16
640 
641 //////////////////////////////////////////////////////////////////////////
642 
643 NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size
644 
645 template <unsigned int R>
646 inline __m128i RotateLeft32(const __m128i& val)
647 {
648 #if defined(CRYPTOPP_AVX512_ROTATE)
649  return _mm_rol_epi32(val, R);
650 #elif defined(__XOP__)
651  return _mm_roti_epi32(val, R);
652 #else
653  return _mm_or_si128(
654  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
655 #endif
656 }
657 
658 template <unsigned int R>
659 inline __m128i RotateRight32(const __m128i& val)
660 {
661 #if defined(CRYPTOPP_AVX512_ROTATE)
662  return _mm_ror_epi32(val, R);
663 #elif defined(__XOP__)
664  return _mm_roti_epi32(val, 32-R);
665 #else
666  return _mm_or_si128(
667  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
668 #endif
669 }
670 
671 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
672 template <>
673 inline __m128i RotateLeft32<8>(const __m128i& val)
674 {
675 #if defined(__XOP__)
676  return _mm_roti_epi32(val, 8);
677 #else
678  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
679  return _mm_shuffle_epi8(val, mask);
680 #endif
681 }
682 
683 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
684 template <>
685 inline __m128i RotateRight32<8>(const __m128i& val)
686 {
687 #if defined(__XOP__)
688  return _mm_roti_epi32(val, 32-8);
689 #else
690  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
691  return _mm_shuffle_epi8(val, mask);
692 #endif
693 }
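// As in the 16-bit case, a 32-bit rotation by 8 is a pure byte permutation,
// so one pshufb replaces the shift/shift/OR sequence; the two masks above
// move each lane's bytes one position in opposite directions.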
694 
695 template <unsigned int IDX>
696 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
697 {
698  // Should not be instantiated
699  CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
700  CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
701  CRYPTOPP_ASSERT(0);
702  return _mm_setzero_si128();
703 }
704 
705 template <>
706 inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
707 {
708  // The shuffle converts to and from little-endian for SSE. A specialized
709  // CHAM implementation can avoid the shuffle by framing the data for
710  // encryption, decryption and benchmarks. The library cannot take the
711  // speed-up because of the byte oriented API.
712  const __m128i r1 = _mm_unpacklo_epi32(a, b);
713  const __m128i r2 = _mm_unpacklo_epi32(c, d);
714  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
715  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
716 }
717 
718 template <>
719 inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
720 {
721  // The shuffle converts to and from little-endian for SSE. A specialized
722  // CHAM implementation can avoid the shuffle by framing the data for
723  // encryption, decryption and benchmarks. The library cannot take the
724  // speed-up because of the byte oriented API.
725  const __m128i r1 = _mm_unpacklo_epi32(a, b);
726  const __m128i r2 = _mm_unpacklo_epi32(c, d);
727  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
728  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
729 }
730 
731 template <>
732 inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
733 {
734  // The shuffle converts to and from little-endian for SSE. A specialized
735  // CHAM implementation can avoid the shuffle by framing the data for
736  // encryption, decryption and benchmarks. The library cannot take the
737  // speed-up because of the byte oriented API.
738  const __m128i r1 = _mm_unpackhi_epi32(a, b);
739  const __m128i r2 = _mm_unpackhi_epi32(c, d);
740  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
741  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
742 }
743 
744 template <>
745 inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
746 {
747  // The shuffle converts to and from little-endian for SSE. A specialized
748  // CHAM implementation can avoid the shuffle by framing the data for
749  // encryption, decryption and benchmarks. The library cannot take the
750  // speed-up because of the byte oriented API.
751  const __m128i r1 = _mm_unpackhi_epi32(a, b);
752  const __m128i r2 = _mm_unpackhi_epi32(c, d);
753  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
754  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
755 }
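// The four specializations above perform a 4x4 transpose of the 32-bit words
// in a..d: UnpackXMM<IDX>(a, b, c, d) returns [a_IDX b_IDX c_IDX d_IDX], and
// the trailing _mm_shuffle_epi8 byte-reverses every 32-bit lane for the
// endian conversion.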
756 
757 template <unsigned int IDX>
758 inline __m128i UnpackXMM(const __m128i& v)
759 {
760  // Should not be instantiated
761  CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
762  return _mm_setzero_si128();
763 }
764 
765 template <>
766 inline __m128i UnpackXMM<0>(const __m128i& v)
767 {
768  return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
769 }
770 
771 template <>
772 inline __m128i UnpackXMM<1>(const __m128i& v)
773 {
774  return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
775 }
776 
777 template <>
778 inline __m128i UnpackXMM<2>(const __m128i& v)
779 {
780  return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
781 }
782 
783 template <>
784 inline __m128i UnpackXMM<3>(const __m128i& v)
785 {
786  return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
787 }
788 
789 template <unsigned int IDX>
790 inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
791 {
792  return UnpackXMM<IDX>(a, b, c, d);
793 }
794 
795 template <unsigned int IDX>
796 inline __m128i RepackXMM(const __m128i& v)
797 {
798  return UnpackXMM<IDX>(v);
799 }
800 
801 inline void CHAM128_Enc_Block(__m128i &block0,
802  const word32 *subkeys, unsigned int rounds)
803 {
804  // Rearrange the data for vectorization. UnpackXMM includes a
805  // little-endian swap for SSE. Thanks to Peter Cordes for help
806  // with packing and unpacking.
807  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
808  __m128i a = UnpackXMM<0>(block0);
809  __m128i b = UnpackXMM<1>(block0);
810  __m128i c = UnpackXMM<2>(block0);
811  __m128i d = UnpackXMM<3>(block0);
812 
813  __m128i counter = _mm_set_epi32(0,0,0,0);
814  __m128i increment = _mm_set_epi32(1,1,1,1);
815 
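// CHAM-128/128 uses 80 rounds with 8 round keys and CHAM-128/256 uses 96
// rounds with 16 round keys, so the key-schedule index below wraps at 8 or
// 16 depending on which variant the caller keyed.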
816  const unsigned int MASK = (rounds == 80 ? 7 : 15);
817  for (int i=0; i<static_cast<int>(rounds); i+=4)
818  {
819  __m128i k, k1, k2, t1, t2;
820  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));
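// As in the CHAM-64 code, _mm_load_sd fetches 64 bits of the key schedule,
// here two consecutive word32 round keys; the two shuffles below broadcast
// the first and the second of them across all four 32-bit lanes.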
821 
822  // Shuffle out two subkeys
823  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
824  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
825 
826  t1 = _mm_xor_si128(a, counter);
827  t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
828  a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
829 
830  counter = _mm_add_epi32(counter, increment);
831 
832  t1 = _mm_xor_si128(b, counter);
833  t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
834  b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
835 
836  counter = _mm_add_epi32(counter, increment);
837 
838  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+2) & MASK])));
839 
840  // Shuffle out two subkeys
841  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
842  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
843 
844  t1 = _mm_xor_si128(c, counter);
845  t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
846  c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
847 
848  counter = _mm_add_epi32(counter, increment);
849 
850  t1 = _mm_xor_si128(d, counter);
851  t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
852  d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
853 
854  counter = _mm_add_epi32(counter, increment);
855  }
856 
857  // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
858  block0 = RepackXMM<0>(a,b,c,d);
859 }
860 
861 inline void CHAM128_Dec_Block(__m128i &block0,
862  const word32 *subkeys, unsigned int rounds)
863 {
864  // Rearrange the data for vectorization. UnpackXMM includes a
865  // little-endian swap for SSE. Thanks to Peter Cordes for help
866  // with packing and unpacking.
867  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
868  __m128i a = UnpackXMM<0>(block0);
869  __m128i b = UnpackXMM<1>(block0);
870  __m128i c = UnpackXMM<2>(block0);
871  __m128i d = UnpackXMM<3>(block0);
872 
873  __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
874  __m128i decrement = _mm_set_epi32(1,1,1,1);
875 
876  const unsigned int MASK = (rounds == 80 ? 7 : 15);
877  for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
878  {
879  __m128i k, k1, k2, t1, t2;
880  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-1) & MASK])));
881 
882  // Shuffle out two subkeys
883  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
884  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
885 
886  // Odd round
887  t1 = RotateRight32<1>(d);
888  t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
889  d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
890 
891  counter = _mm_sub_epi32(counter, decrement);
892 
893  // Even round
894  t1 = RotateRight32<8>(c);
895  t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
896  c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
897 
898  counter = _mm_sub_epi32(counter, decrement);
899  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
900 
901  // Shuffle out two subkeys
902  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
903  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
904 
905  // Odd round
906  t1 = RotateRight32<1>(b);
907  t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
908  b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
909 
910  counter = _mm_sub_epi32(counter, decrement);
911 
912  // Even round
913  t1 = RotateRight32<8>(a);
914  t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
915  a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
916 
917  counter = _mm_sub_epi32(counter, decrement);
918  }
919 
920  // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
921  block0 = RepackXMM<0>(a,b,c,d);
922 }
923 
924 inline void CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
925  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
926 {
927  // Rearrange the data for vectorization. UnpackXMM includes a
928  // little-endian swap for SSE. Thanks to Peter Cordes for help
929  // with packing and unpacking.
930  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
931  __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
932  __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
933  __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
934  __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
935 
936  __m128i counter = _mm_set_epi32(0,0,0,0);
937  __m128i increment = _mm_set_epi32(1,1,1,1);
938 
939  const unsigned int MASK = (rounds == 80 ? 7 : 15);
940  for (int i=0; i<static_cast<int>(rounds); i+=4)
941  {
942  __m128i k, k1, k2, t1, t2;
943  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));
944 
945  // Shuffle out two subkeys
946  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
947  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
948 
949  t1 = _mm_xor_si128(a, counter);
950  t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
951  a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
952 
953  counter = _mm_add_epi32(counter, increment);
954 
955  t1 = _mm_xor_si128(b, counter);
956  t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
957  b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
958 
959  counter = _mm_add_epi32(counter, increment);
960  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+2) & MASK])));
961 
962  // Shuffle out two subkeys
963  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
964  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
965 
966  t1 = _mm_xor_si128(c, counter);
967  t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
968  c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
969 
970  counter = _mm_add_epi32(counter, increment);
971 
972  t1 = _mm_xor_si128(d, counter);
973  t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
974  d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
975 
976  counter = _mm_add_epi32(counter, increment);
977  }
978 
979  // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
980  block0 = RepackXMM<0>(a,b,c,d);
981  block1 = RepackXMM<1>(a,b,c,d);
982  block2 = RepackXMM<2>(a,b,c,d);
983  block3 = RepackXMM<3>(a,b,c,d);
984 }
985 
986 inline void CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
987  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
988 {
989  // Rearrange the data for vectorization. UnpackXMM includes a
990  // little-endian swap for SSE. Thanks to Peter Cordes for help
991  // with packing and unpacking.
992  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
993  __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
994  __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
995  __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
996  __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
997 
998  __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
999  __m128i decrement = _mm_set_epi32(1,1,1,1);
1000 
1001  const unsigned int MASK = (rounds == 80 ? 7 : 15);
1002  for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
1003  {
1004  __m128i k, k1, k2, t1, t2;
1005  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-1) & MASK])));
1006 
1007  // Shuffle out two subkeys
1008  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
1009  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
1010 
1011  // Odd round
1012  t1 = RotateRight32<1>(d);
1013  t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
1014  d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1015 
1016  counter = _mm_sub_epi32(counter, decrement);
1017 
1018  // Even round
1019  t1 = RotateRight32<8>(c);
1020  t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
1021  c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1022 
1023  counter = _mm_sub_epi32(counter, decrement);
1024  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
1025 
1026  // Shuffle out two subkeys
1027  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
1028  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
1029 
1030  // Odd round
1031  t1 = RotateRight32<1>(b);
1032  t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
1033  b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1034 
1035  counter = _mm_sub_epi32(counter, decrement);
1036 
1037  // Even round
1038  t1 = RotateRight32<8>(a);
1039  t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
1040  a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1041 
1042  counter = _mm_sub_epi32(counter, decrement);
1043  }
1044 
1045  // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
1046  block0 = RepackXMM<0>(a,b,c,d);
1047  block1 = RepackXMM<1>(a,b,c,d);
1048  block2 = RepackXMM<2>(a,b,c,d);
1049  block3 = RepackXMM<3>(a,b,c,d);
1050 }
1051 
1052 //////////////////////////////////////////////////////////////////////////
1053 
1054 NAMESPACE_END // W32
1055 
1056 #endif // CRYPTOPP_SSSE3_AVAILABLE
1057 
1058 ANONYMOUS_NAMESPACE_END
1059 
1060 NAMESPACE_BEGIN(CryptoPP)
1061 
1062 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
1063 size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
1064  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1065 {
1066  return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
1067  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1068 }
1069 
1070 size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
1071  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1072 {
1073  return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
1074  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1075 }
1076 
1077 size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1078  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1079 {
1080  return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Enc_Block, W32::CHAM128_Enc_4_Blocks,
1081  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1082 }
1083 
1084 size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1085  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1086 {
1087  return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Dec_Block, W32::CHAM128_Dec_4_Blocks,
1088  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1089 }
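// These four functions are internal entry points: cham.cpp is expected to
// dispatch to them from CHAM's AdvancedProcessBlocks when HasSSSE3() reports
// runtime support, and to fall back to the portable C++ implementation
// otherwise.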
1090 #endif // CRYPTOPP_SSSE3_AVAILABLE
1091 
1092 NAMESPACE_END