Crypto++  8.0
Free C++ class library of cryptographic schemes
sm4_simd.cpp
1 // sm4_simd.cpp - written and placed in the public domain by
2 // Markku-Juhani O. Saarinen and Jeffrey Walton
3 //
4 // This source file uses intrinsics and built-ins to gain access to
5 // AESNI, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
6 // source file is needed because additional CXXFLAGS are required to enable
7 // the appropriate instructions sets in some build configurations.
8 //
9 // AES-NI based on Markku-Juhani O. Saarinen work at https://github.com/mjosaarinen/sm4ni.
10 //
11 // ARMv8 is upcoming.
12 
13 #include "pch.h"
14 #include "config.h"
15 
16 #include "sm4.h"
17 #include "misc.h"
18 #include "adv_simd.h"
19 
// Uncomment for benchmarking C++ against SSE.
// Do so in both sm4.cpp and sm4_simd.cpp.
// #undef CRYPTOPP_AESNI_AVAILABLE
23 
24 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
25 # include <xmmintrin.h>
26 # include <emmintrin.h>
27 #endif
28 
29 #if (CRYPTOPP_AESNI_AVAILABLE)
30 # include <tmmintrin.h>
31 # include <wmmintrin.h>
32 #endif
33 
// Squash MS LNK4221 and libtool warnings. An exported symbol keeps this
// translation unit non-empty even when no SIMD path is compiled in.
extern const char SM4_SIMD_FNAME[] = __FILE__;
36 
37 ANONYMOUS_NAMESPACE_BEGIN
38 
39 using CryptoPP::word32;
40 
41 #if (CRYPTOPP_AESNI_AVAILABLE)
42 
/// \brief Logical left shift of each 32-bit lane
/// \tparam R the shift amount in bits
template <unsigned int R>
inline __m128i ShiftLeft(const __m128i& val)
{
    return _mm_slli_epi32(val, static_cast<int>(R));
}
48 
/// \brief Logical right shift of each 32-bit lane
/// \tparam R the shift amount in bits
template <unsigned int R>
inline __m128i ShiftRight(const __m128i& val)
{
    return _mm_srli_epi32(val, static_cast<int>(R));
}
54 
/// \brief Logical left shift of each 64-bit lane
/// \tparam R the shift amount in bits
template <unsigned int R>
inline __m128i ShiftLeft64(const __m128i& val)
{
    return _mm_slli_epi64(val, static_cast<int>(R));
}
60 
/// \brief Logical right shift of each 64-bit lane
/// \tparam R the shift amount in bits
template <unsigned int R>
inline __m128i ShiftRight64(const __m128i& val)
{
    return _mm_srli_epi64(val, static_cast<int>(R));
}
66 
/// \brief Rotate each 32-bit lane left
/// \tparam R the rotate amount in bits
/// \details SSE has no rotate instruction, so the rotate is composed
///  from a pair of shifts OR'd together.
template <unsigned int R>
inline __m128i RotateLeft(const __m128i& val)
{
    const __m128i hi = _mm_slli_epi32(val, R);
    const __m128i lo = _mm_srli_epi32(val, 32-R);
    return _mm_or_si128(hi, lo);
}
73 
/// \brief Rotate each 32-bit lane right
/// \tparam R the rotate amount in bits
/// \details SSE has no rotate instruction, so the rotate is composed
///  from a pair of shifts OR'd together.
template <unsigned int R>
inline __m128i RotateRight(const __m128i& val)
{
    const __m128i lo = _mm_srli_epi32(val, R);
    const __m128i hi = _mm_slli_epi32(val, 32-R);
    return _mm_or_si128(hi, lo);
}
80 
81 template <>
82 inline __m128i RotateLeft<8>(const __m128i& val)
83 {
84  const __m128i r08 = _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003);
85  return _mm_shuffle_epi8(val, r08);
86 }
87 
88 template <>
89 inline __m128i RotateLeft<16>(const __m128i& val)
90 {
91  const __m128i mask = _mm_set_epi32(0x0D0C0F0E, 0x09080B0A, 0x05040706, 0x01000302);
92  return _mm_shuffle_epi8(val, mask);
93 }
94 
95 template <>
96 inline __m128i RotateLeft<24>(const __m128i& val)
97 {
98  const __m128i mask = _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201);
99  return _mm_shuffle_epi8(val, mask);
100 }
101 
102 /// \brief Unpack XMM words
103 /// \tparam IDX the element from each XMM word
104 /// \param a the first XMM word
105 /// \param b the second XMM word
106 /// \param c the third XMM word
107 /// \param d the fourth XMM word
108 /// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation
109 /// equivalent to <tt>a[IDX] || b[IDX] || c[IDX] || d[IDX]</tt>.
110 template <unsigned int IDX>
111 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
112 {
113  // Should not be instantiated
114  CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
115  CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
116  CRYPTOPP_ASSERT(0);
117  return _mm_setzero_si128();
118 }
119 
120 template <>
121 inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
122 {
123  const __m128i r1 = _mm_unpacklo_epi32(a, b);
124  const __m128i r2 = _mm_unpacklo_epi32(c, d);
125  return _mm_unpacklo_epi64(r1, r2);
126 }
127 
128 template <>
129 inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
130 {
131  const __m128i r1 = _mm_unpacklo_epi32(a, b);
132  const __m128i r2 = _mm_unpacklo_epi32(c, d);
133  return _mm_unpackhi_epi64(r1, r2);
134 }
135 
136 template <>
137 inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
138 {
139  const __m128i r1 = _mm_unpackhi_epi32(a, b);
140  const __m128i r2 = _mm_unpackhi_epi32(c, d);
141  return _mm_unpacklo_epi64(r1, r2);
142 }
143 
144 template <>
145 inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
146 {
147  const __m128i r1 = _mm_unpackhi_epi32(a, b);
148  const __m128i r2 = _mm_unpackhi_epi32(c, d);
149  return _mm_unpackhi_epi64(r1, r2);
150 }
151 
152 /// \brief Unpack a XMM word
153 /// \tparam IDX the element from each XMM word
154 /// \param v the first XMM word
155 /// \details UnpackXMM selects the IDX element from v and returns a concatenation
156 /// equivalent to <tt>v[IDX] || v[IDX] || v[IDX] || v[IDX]</tt>.
157 template <unsigned int IDX>
158 inline __m128i UnpackXMM(const __m128i& v)
159 {
160  // Should not be instantiated
161  CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
162  return _mm_setzero_si128();
163 }
164 
165 template <>
166 inline __m128i UnpackXMM<0>(const __m128i& v)
167 {
168  // Splat to all lanes
169  return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
170 }
171 
172 template <>
173 inline __m128i UnpackXMM<1>(const __m128i& v)
174 {
175  // Splat to all lanes
176  return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
177 }
178 
179 template <>
180 inline __m128i UnpackXMM<2>(const __m128i& v)
181 {
182  // Splat to all lanes
183  return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
184 }
185 
186 template <>
187 inline __m128i UnpackXMM<3>(const __m128i& v)
188 {
189  // Splat to all lanes
190  return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
191 }
192 
193 template <unsigned int IDX>
194 inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
195 {
196  return UnpackXMM<IDX>(a, b, c, d);
197 }
198 
199 template <unsigned int IDX>
200 inline __m128i RepackXMM(const __m128i& v)
201 {
202  return UnpackXMM<IDX>(v);
203 }
204 
/// \brief Encrypt four 128-bit blocks with SM4 using AES-NI
/// \param block0 the first block (in/out)
/// \param block1 the second block (in/out)
/// \param block2 the third block (in/out)
/// \param block3 the fourth block (in/out)
/// \param subkeys pointer to the 32 SM4 round keys
/// \details The SM4 S-box is evaluated with <tt>_mm_aesenclast_si128</tt> by
///  mapping between the SM4 and AES field representations with two nibble-wise
///  affine transforms, following Markku-Juhani O. Saarinen's sm4ni construction.
inline void SM4_Encrypt(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys)
{
    // nibble mask: 0x0F in every byte, isolates low/high nibbles
    const __m128i c0f = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

    // flip all bytes in all 32-bit words (SM4 is big-endian)
    const __m128i flp = _mm_set_epi32(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);

    // inverse shift rows, undoes the ShiftRows performed by AESENCLAST
    const __m128i shr = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00);

    // Affine transform 1 (low and high nibbles), into the AES S-box domain
    const __m128i m1l = _mm_set_epi32(0xC7C1B4B2, 0x22245157, 0x9197E2E4, 0x74720701);
    const __m128i m1h = _mm_set_epi32(0xF052B91B, 0xF95BB012, 0xE240AB09, 0xEB49A200);

    // Affine transform 2 (low and high nibbles), back to the SM4 domain
    const __m128i m2l = _mm_set_epi32(0xEDD14478, 0x172BBE82, 0x5B67F2CE, 0xA19D0834);
    const __m128i m2h = _mm_set_epi32(0x11CDBE62, 0xCC1063BF, 0xAE7201DD, 0x73AFDC00);

    // Transpose so t<i> holds word i of all four blocks; the four blocks
    // are then processed in parallel, one word per SSE lane.
    __m128i t0 = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i t1 = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i t2 = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i t3 = UnpackXMM<3>(block0, block1, block2, block3);

    // Byte-swap each 32-bit word into big-endian order
    t0 = _mm_shuffle_epi8(t0, flp);
    t1 = _mm_shuffle_epi8(t1, flp);
    t2 = _mm_shuffle_epi8(t2, flp);
    t3 = _mm_shuffle_epi8(t3, flp);

    const unsigned int ROUNDS = 32;
    for (unsigned int i = 0; i < ROUNDS; i++)
    {
        // Broadcast round key i into all four lanes
        const __m128i k = _mm_shuffle_epi32(_mm_castps_si128(
            _mm_load_ss((const float*)(subkeys+i))), _MM_SHUFFLE(0,0,0,0));

        __m128i x, y;
        // Round function input: x = t1 ^ t2 ^ t3 ^ rk[i]
        x = _mm_xor_si128(t1, _mm_xor_si128(t2, _mm_xor_si128(t3, k)));

        y = _mm_and_si128(x, c0f); // inner affine
        y = _mm_shuffle_epi8(m1l, y);
        x = _mm_and_si128(ShiftRight64<4>(x), c0f);
        x = _mm_xor_si128(_mm_shuffle_epi8(m1h, x), y);

        x = _mm_shuffle_epi8(x, shr); // inverse MixColumns
        x = _mm_aesenclast_si128(x, c0f); // AESNI instruction

        y = _mm_andnot_si128(x, c0f); // outer affine
        y = _mm_shuffle_epi8(m2l, y);
        x = _mm_and_si128(ShiftRight64<4>(x), c0f);
        x = _mm_xor_si128(_mm_shuffle_epi8(m2h, x), y);

        // 4 parallel L1 linear transforms. Since rotl(y,2) below applies to
        // y = x ^ rotl(x,8) ^ rotl(x,16), the net effect together with the
        // rotl(x,24) term is L(x) = x ^ rotl(x,2) ^ rotl(x,10) ^ rotl(x,18) ^ rotl(x,24).
        y = _mm_xor_si128(x, RotateLeft<8>(x));
        y = _mm_xor_si128(y, RotateLeft<16>(x));
        y = _mm_xor_si128(ShiftLeft<2>(y), ShiftRight<30>(y));
        x = _mm_xor_si128(x, _mm_xor_si128(y, RotateLeft<24>(x)));

        // rotate registers: drop the oldest word, append the new one
        x = _mm_xor_si128(x, t0);
        t0 = t1; t1 = t2;
        t2 = t3; t3 = x;
    }

    // Byte-swap each 32-bit word back to little-endian order
    t0 = _mm_shuffle_epi8(t0, flp);
    t1 = _mm_shuffle_epi8(t1, flp);
    t2 = _mm_shuffle_epi8(t2, flp);
    t3 = _mm_shuffle_epi8(t3, flp);

    // Transpose back; per SM4, the final state is emitted in reverse
    // word order, hence (t3, t2, t1, t0).
    block0 = RepackXMM<0>(t3,t2,t1,t0);
    block1 = RepackXMM<1>(t3,t2,t1,t0);
    block2 = RepackXMM<2>(t3,t2,t1,t0);
    block3 = RepackXMM<3>(t3,t2,t1,t0);
}
279 
280 inline void SM4_Enc_4_Blocks(__m128i &block0, __m128i &block1,
281  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
282 {
283  SM4_Encrypt(block0, block1, block2, block3, subkeys);
284 }
285 
286 inline void SM4_Dec_4_Blocks(__m128i &block0, __m128i &block1,
287  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
288 {
289  SM4_Encrypt(block0, block1, block2, block3, subkeys);
290 }
291 
292 inline void SM4_Enc_Block(__m128i &block0,
293  const word32 *subkeys, unsigned int /*rounds*/)
294 {
295  __m128i t1 = _mm_setzero_si128();
296  __m128i t2 = _mm_setzero_si128();
297  __m128i t3 = _mm_setzero_si128();
298 
299  SM4_Encrypt(block0, t1, t2, t3, subkeys);
300 }
301 
302 inline void SM4_Dec_Block(__m128i &block0,
303  const word32 *subkeys, unsigned int /*rounds*/)
304 {
305  __m128i t1 = _mm_setzero_si128();
306  __m128i t2 = _mm_setzero_si128();
307  __m128i t3 = _mm_setzero_si128();
308 
309  SM4_Encrypt(block0, t1, t2, t3, subkeys);
310 }
311 
312 #endif // CRYPTOPP_AESNI_AVAILABLE
313 
314 ANONYMOUS_NAMESPACE_END
315 
316 NAMESPACE_BEGIN(CryptoPP)
317 
318 #if defined(CRYPTOPP_AESNI_AVAILABLE)
319 size_t SM4_Enc_AdvancedProcessBlocks_AESNI(const word32* subKeys, size_t rounds,
320  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
321 {
322  return AdvancedProcessBlocks128_4x1_SSE(SM4_Enc_Block, SM4_Enc_4_Blocks,
323  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
324 }
325 #endif // CRYPTOPP_AESNI_AVAILABLE
326 
327 NAMESPACE_END
Utility functions for the Crypto++ library.
Library configuration file.
Template for AdvancedProcessBlocks and SIMD processing.
Precompiled header file.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:60
Classes for the SM4 block cipher.
Crypto++ library namespace.