Crypto++ 8.0
Free C++ class library of cryptographic schemes
donna_sse.cpp
1 // donna_sse.cpp - written and placed in public domain by Jeffrey Walton
2 // This is an integration of Andrew Moon's public domain code.
3 // Also see https://github.com/floodyberry/curve25519-donna.
4 
5 // This is an integration of Andrew Moon's public domain code. The port was
6 // clean, but it has one potential problem. The original code is C and relies
7 // upon unions. Accessing the inactive union member is undefined behavior in
8 // C++. That means copying the array into packedelem8.u is OK; but then using
9 // packedelem8.v in a calculation is UB. Fortunately most (all?) compilers
10 // take pity on C++ developers and compile the code. We will have to keep an
11 // eye on things or rewrite significant portions of this code.
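For readers who have not hit this corner of the standard, the following stand-alone sketch shows the pattern the comment describes. The union name and member layout are assumptions made for illustration only; the real definitions live in donna_sse.h.

    #include <cstring>
    #include <emmintrin.h>

    // Hypothetical stand-in for the real packedelem8 union
    union packedelem8_sketch {
        unsigned char u[16];   // byte view
        __m128i       v;       // SSE2 view
    };

    void union_sketch(const unsigned char in[16]) {
        packedelem8_sketch p;
        std::memcpy(p.u, in, 16);   // writing p.u makes u the active member
        __m128i a = p.v;            // reading the inactive member v is UB in C++ (legal in C)
        // A well-defined alternative is an explicit unaligned load from the bytes:
        __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
        (void)a; (void)b;
    }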
12 
13 // If needed, see Moon's commit "Go back to ignoring 256th bit [sic]",
14 // https://github.com/floodyberry/curve25519-donna/commit/57a683d18721a658
15 
16 #include "pch.h"
17 
18 #include "config.h"
19 #include "donna.h"
20 #include "secblock.h"
21 #include "misc.h"
22 
23 // The data is aligned, but Clang issues a warning based on the type
24 // rather than the actual alignment of the variable and data.
25 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
26 # pragma GCC diagnostic ignored "-Wcast-align"
27 #endif
28 
29 // Squash MS LNK4221 and libtool warnings
30 extern const char DONNA_SSE_FNAME[] = __FILE__;
31 
32 #if (CRYPTOPP_CURVE25519_SSE2)
33 
34 #include "donna_sse.h"
35 
36 ANONYMOUS_NAMESPACE_BEGIN
37 
38 using CryptoPP::byte;
39 using CryptoPP::word32;
40 using CryptoPP::sword32;
41 using CryptoPP::word64;
42 using CryptoPP::sword64;
43 using CryptoPP::GetBlock;
44 using CryptoPP::LittleEndian;
45 
46 // Bring in all the symbols from the SSE header
47 using namespace CryptoPP::Donna::ArchSSE;
48 
49 /* Copy a bignum to another: out = in */
50 inline void
51 curve25519_copy(bignum25519 out, const bignum25519 in) {
52  xmmi x0,x1,x2;
53  x0 = _mm_load_si128((xmmi*)in + 0);
54  x1 = _mm_load_si128((xmmi*)in + 1);
55  x2 = _mm_load_si128((xmmi*)in + 2);
56  _mm_store_si128((xmmi*)out + 0, x0);
57  _mm_store_si128((xmmi*)out + 1, x1);
58  _mm_store_si128((xmmi*)out + 2, x2);
59 }
60 
61 /* Take a little-endian, 32-byte number and expand it into polynomial form */
62 inline void
63 curve25519_expand(bignum25519 out, const byte in[32]) {
64  word32 x0,x1,x2,x3,x4,x5,x6,x7;
65 
66  x0 = *(word32 *)(in + 0);
67  x1 = *(word32 *)(in + 4);
68  x2 = *(word32 *)(in + 8);
69  x3 = *(word32 *)(in + 12);
70  x4 = *(word32 *)(in + 16);
71  x5 = *(word32 *)(in + 20);
72  x6 = *(word32 *)(in + 24);
73  x7 = *(word32 *)(in + 28);
74 
75  out[0] = ( x0 ) & reduce_mask_26;
76  out[1] = ((((word64)x1 << 32) | x0) >> 26) & reduce_mask_25;
77  out[2] = ((((word64)x2 << 32) | x1) >> 19) & reduce_mask_26;
78  out[3] = ((((word64)x3 << 32) | x2) >> 13) & reduce_mask_25;
79  out[4] = (( x3) >> 6) & reduce_mask_26;
80  out[5] = ( x4 ) & reduce_mask_25;
81  out[6] = ((((word64)x5 << 32) | x4) >> 25) & reduce_mask_26;
82  out[7] = ((((word64)x6 << 32) | x5) >> 19) & reduce_mask_25;
83  out[8] = ((((word64)x7 << 32) | x6) >> 12) & reduce_mask_26;
84  out[9] = (( x7) >> 6) & reduce_mask_25; /* ignore the top bit */
85 
86  out[10] = 0;
87  out[11] = 0;
88 }
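The ten limbs written above use the usual donna mixed radix: limb sizes alternate between 26 and 25 bits, so the encoded value is

    f = out[0] + out[1]*2^26 + out[2]*2^51 + out[3]*2^77 + out[4]*2^102
      + out[5]*2^128 + out[6]*2^153 + out[7]*2^179 + out[8]*2^204 + out[9]*2^230   (mod 2^255 - 19)

which accounts for the shift amounts 26, 19, 13, 6, 25, 19, 12 and 6 above. Elements 10 and 11 are only padding so the bignum fills three 16-byte vectors for the aligned SSE2 loads and stores.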
89 
90 /* Take a fully reduced polynomial form number and contract it into a
91  * little-endian, 32-byte array
92  */
93 inline void
94 curve25519_contract(byte out[32], const bignum25519 in) {
95  ALIGN(16) bignum25519 f;
96 
97  curve25519_copy(f, in);
98 
99  #define carry_pass() \
100  f[1] += f[0] >> 26; f[0] &= reduce_mask_26; \
101  f[2] += f[1] >> 25; f[1] &= reduce_mask_25; \
102  f[3] += f[2] >> 26; f[2] &= reduce_mask_26; \
103  f[4] += f[3] >> 25; f[3] &= reduce_mask_25; \
104  f[5] += f[4] >> 26; f[4] &= reduce_mask_26; \
105  f[6] += f[5] >> 25; f[5] &= reduce_mask_25; \
106  f[7] += f[6] >> 26; f[6] &= reduce_mask_26; \
107  f[8] += f[7] >> 25; f[7] &= reduce_mask_25; \
108  f[9] += f[8] >> 26; f[8] &= reduce_mask_26;
109 
110  #define carry_pass_full() \
111  carry_pass() \
112  f[0] += 19 * (f[9] >> 25); f[9] &= reduce_mask_25;
113 
114  #define carry_pass_final() \
115  carry_pass() \
116  f[9] &= reduce_mask_25;
117 
118  carry_pass_full()
119  carry_pass_full()
120 
121  /* now f is between 0 and 2^255-1, properly carried. */
122  /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
123  f[0] += 19;
124  carry_pass_full()
125 
126  /* now between 19 and 2^255-1 in both cases, and offset by 19. */
127  f[0] += (1 << 26) - 19;
128  f[1] += (1 << 25) - 1;
129  f[2] += (1 << 26) - 1;
130  f[3] += (1 << 25) - 1;
131  f[4] += (1 << 26) - 1;
132  f[5] += (1 << 25) - 1;
133  f[6] += (1 << 26) - 1;
134  f[7] += (1 << 25) - 1;
135  f[8] += (1 << 26) - 1;
136  f[9] += (1 << 25) - 1;
137 
138  /* now between 2^255 and 2^256-20, and offset by 2^255. */
139  carry_pass_final()
140 
141  #undef carry_pass
142  #undef carry_pass_full
143  #undef carry_pass_final
144 
145  *(word32 *)(out + 0) = ((f[0] ) | (f[1] << 26));
146  *(word32 *)(out + 4) = ((f[1] >> 6) | (f[2] << 19));
147  *(word32 *)(out + 8) = ((f[2] >> 13) | (f[3] << 13));
148  *(word32 *)(out + 12) = ((f[3] >> 19) | (f[4] << 6));
149  *(word32 *)(out + 16) = ((f[5] ) | (f[6] << 25));
150  *(word32 *)(out + 20) = ((f[6] >> 7) | (f[7] << 19));
151  *(word32 *)(out + 24) = ((f[7] >> 13) | (f[8] << 12));
152  *(word32 *)(out + 28) = ((f[8] >> 20) | (f[9] << 6));
153 }
154 
155 /*
156  * Maybe swap the contents of two bignum25519 arrays (@a and @b).
157  * Perform the swap iff @swap is non-zero.
158  */
159 inline void
160 curve25519_swap_conditional(bignum25519 a, bignum25519 b, word32 iswap) {
161  const word32 swap = (word32)(-(sword32)iswap);
162  xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
163  xmmi mask = _mm_cvtsi32_si128(swap);
164  mask = _mm_shuffle_epi32(mask, 0);
165  a0 = _mm_load_si128((xmmi *)a + 0);
166  a1 = _mm_load_si128((xmmi *)a + 1);
167  a2 = _mm_load_si128((xmmi *)a + 2);
168  b0 = _mm_load_si128((xmmi *)b + 0);
169  b1 = _mm_load_si128((xmmi *)b + 1);
170  b2 = _mm_load_si128((xmmi *)b + 2);
171  b0 = _mm_xor_si128(a0, b0);
172  b1 = _mm_xor_si128(a1, b1);
173  b2 = _mm_xor_si128(a2, b2);
174  x0 = _mm_and_si128(b0, mask);
175  x1 = _mm_and_si128(b1, mask);
176  x2 = _mm_and_si128(b2, mask);
177  x0 = _mm_xor_si128(x0, a0);
178  x1 = _mm_xor_si128(x1, a1);
179  x2 = _mm_xor_si128(x2, a2);
180  a0 = _mm_xor_si128(x0, b0);
181  a1 = _mm_xor_si128(x1, b1);
182  a2 = _mm_xor_si128(x2, b2);
183  _mm_store_si128((xmmi *)a + 0, x0);
184  _mm_store_si128((xmmi *)a + 1, x1);
185  _mm_store_si128((xmmi *)a + 2, x2);
186  _mm_store_si128((xmmi *)b + 0, a0);
187  _mm_store_si128((xmmi *)b + 1, a1);
188  _mm_store_si128((xmmi *)b + 2, a2);
189 }
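The XOR/mask dance is easier to see in scalar form. The following is an illustrative sketch only, not code from the library; the vector version above applies the same idea across the three XMM registers that hold a bignum25519.

    #include <cstdint>

    inline void cswap_sketch(uint32_t& a, uint32_t& b, uint32_t iswap /* 0 or 1 */) {
        const uint32_t mask = (uint32_t)(-(int32_t)iswap);  // 0x00000000 or 0xFFFFFFFF
        const uint32_t x = (a ^ b) & mask;
        a ^= x;   // unchanged when mask is zero
        b ^= x;   // exchanged when mask is all ones
    }

Because the same instructions execute regardless of iswap, the swap leaks no timing information about the scalar bit that drives it.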
190 
191 /* interleave two bignums */
192 inline void
193 curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
194  xmmi x0,x1,x2,z0,z1,z2;
195 
196  x0 = _mm_load_si128((xmmi *)(x + 0));
197  x1 = _mm_load_si128((xmmi *)(x + 4));
198  x2 = _mm_load_si128((xmmi *)(x + 8));
199  z0 = _mm_load_si128((xmmi *)(z + 0));
200  z1 = _mm_load_si128((xmmi *)(z + 4));
201  z2 = _mm_load_si128((xmmi *)(z + 8));
202 
203  out[0].v = _mm_unpacklo_epi32(x0, z0);
204  out[1].v = _mm_unpackhi_epi32(x0, z0);
205  out[2].v = _mm_unpacklo_epi32(x1, z1);
206  out[3].v = _mm_unpackhi_epi32(x1, z1);
207  out[4].v = _mm_unpacklo_epi32(x2, z2);
208 }
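Concretely, the unpack instructions interleave limbs of the two operands (lanes listed from low to high):

    out[0] = [ x[0], z[0], x[1], z[1] ]
    out[1] = [ x[2], z[2], x[3], z[3] ]
    out[2] = [ x[4], z[4], x[5], z[5] ]
    out[3] = [ x[6], z[6], x[7], z[7] ]
    out[4] = [ x[8], z[8], x[9], z[9] ]

so each SSE2 instruction applied to a packedelem32 works on the corresponding limbs of both x and z, which is how the Montgomery ladder below advances its two points in parallel.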
209 
210 /* split a packed bignum into its two parts */
211 inline void
212 curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
213  _mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
214  _mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
215  _mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v) );
216  _mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
217  _mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
218  _mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v) );
219 }
220 
221 /* add two packed bignums */
222 inline void
223 curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
224  out[0].v = _mm_add_epi32(r[0].v, s[0].v);
225  out[1].v = _mm_add_epi32(r[1].v, s[1].v);
226  out[2].v = _mm_add_epi32(r[2].v, s[2].v);
227  out[3].v = _mm_add_epi32(r[3].v, s[3].v);
228  out[4].v = _mm_add_epi32(r[4].v, s[4].v);
229 }
230 
231 /* subtract two packed bignums */
232 inline void
233 curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
234  xmmi r0,r1,r2,r3,r4;
235  xmmi s0,s1,s2,s3;
236  xmmi c1,c2;
237 
238  r0 = _mm_add_epi32(r[0].v, packed32zeromodp0.v);
239  r1 = _mm_add_epi32(r[1].v, packed32zeromodp1.v);
240  r2 = _mm_add_epi32(r[2].v, packed32zeromodp1.v);
241  r3 = _mm_add_epi32(r[3].v, packed32zeromodp1.v);
242  r4 = _mm_add_epi32(r[4].v, packed32zeromodp1.v);
243  r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
244  r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
245  r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
246  r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
247  r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
248 
249  s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
250  s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
251  s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
252  s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
253 
254  c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
255  c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));
256 
257  out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
258  out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
259  out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
260  out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
261  out[4].v = r4; /* 88 99 */
262 }
263 
264 /* multiply two packed bignums */
265 inline void
266 curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
267  xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
268  xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
269  xmmi c1,c2;
270 
271  out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
272  out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
273  r1_2 = _mm_slli_epi32(r[1].v, 1);
274  out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
275  out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
276  r3_2 = _mm_slli_epi32(r[3].v, 1);
277  out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
278  out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
279  r5_2 = _mm_slli_epi32(r[5].v, 1);
280  out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
281  out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v))))))));
282  r7_2 = _mm_slli_epi32(r[7].v, 1);
283  out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
284  out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));
285 
286  r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
287  r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
288  r1_2 = _mm_slli_epi32(r1, 1);
289  r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
290  r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
291  r3_2 = _mm_slli_epi32(r3, 1);
292  r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
293  r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
294  r5_2 = _mm_slli_epi32(r5, 1);
295  r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
296  r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
297  r7_2 = _mm_slli_epi32(r7, 1);
298  r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
299  r9_2 = _mm_slli_epi32(r9, 1);
300 
301  out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
302  out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
303  out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
304  out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
305  out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
306  out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
307  out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
308  out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v)));
309  out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));
310 
311  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
312  c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
313  c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
314  c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
315  c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
316  c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
317  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
318 }
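Two details of this routine are worth spelling out. First, the reduction: since

    2^255 = 19   (mod 2^255 - 19)

a product term that lands at limb position 10+k can be folded back into position k after scaling by 19, which is what the packednineteen multiples of r[1..9] implement. Second, the mixed radix: limb i carries weight 2^ceil(25.5*i), and when both factors sit at odd positions the combined weight overshoots the target limb by one bit, so those cross products are pre-doubled (the r1_2, r3_2, r5_2, r7_2 and r9_2 values).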
319 
320 /* multiply two bignums: out = r * s */
321 void
322 curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
323  xmmi m01,m23,m45,m67,m89;
324  xmmi m0123,m4567;
325  xmmi s0123,s4567;
326  xmmi s01,s23,s45,s67,s89;
327  xmmi s12,s34,s56,s78,s9;
328  xmmi r0,r2,r4,r6,r8;
329  xmmi r1,r3,r5,r7,r9;
330  xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
331  xmmi c1,c2,c3;
332 
333  s0123 = _mm_load_si128((xmmi*)s + 0);
334  s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
335  s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
336  s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
337  s4567 = _mm_load_si128((xmmi*)s + 1);
338  s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
339  s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
340  s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
341  s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
342  s89 = _mm_load_si128((xmmi*)s + 2);
343  s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
344  s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
345  s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));
346 
347  r0 = _mm_load_si128((xmmi*)r + 0);
348  r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
349  r1 = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_top64bitmask.v));
350  r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
351  r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
352  r3 = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_top64bitmask.v));
353  r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
354  r4 = _mm_load_si128((xmmi*)r + 1);
355  r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
356  r5 = _mm_add_epi64(r5, _mm_and_si128(r5, sse2_top64bitmask.v));
357  r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
358  r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
359  r7 = _mm_add_epi64(r7, _mm_and_si128(r7, sse2_top64bitmask.v));
360  r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
361  r8 = _mm_load_si128((xmmi*)r + 2);
362  r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
363  r9 = _mm_add_epi64(r9, _mm_and_si128(r9, sse2_top64bitmask.v));
364  r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));
365 
366  m01 = _mm_mul_epu32(r1,s01);
367  m23 = _mm_mul_epu32(r1,s23);
368  m45 = _mm_mul_epu32(r1,s45);
369  m67 = _mm_mul_epu32(r1,s67);
370  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
371  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
372  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
373  m89 = _mm_mul_epu32(r1,s89);
374  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
375  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
376  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
377  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
378  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
379  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
380  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));
381 
382  /* shift up */
383  m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
384  m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
385  m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
386  m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
387  m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));
388 
389  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
390  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
391  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
392  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
393  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
394  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
395  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
396  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
397  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
398  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
399  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
400  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
401  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
402  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
403  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));
404 
405  r219 = _mm_mul_epu32(r2, packednineteen.v);
406  r419 = _mm_mul_epu32(r4, packednineteen.v);
407  r619 = _mm_mul_epu32(r6, packednineteen.v);
408  r819 = _mm_mul_epu32(r8, packednineteen.v);
409  r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
410  r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
411  r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
412  r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
413  r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);
414 
415  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
416  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
417  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
418  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
419  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
420  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
421  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
422  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
423  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
424  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
425  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
426  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
427  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
428  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
429  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
430  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
431  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
432  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
433  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
434  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
435  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
436  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
437  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
438  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
439  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));
440 
441  r0 = _mm_unpacklo_epi64(m01, m45);
442  r1 = _mm_unpackhi_epi64(m01, m45);
443  r2 = _mm_unpacklo_epi64(m23, m67);
444  r3 = _mm_unpackhi_epi64(m23, m67);
445  r4 = _mm_unpacklo_epi64(m89, m89);
446  r5 = _mm_unpackhi_epi64(m89, m89);
447 
448  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
449  c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
450  c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
451  c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
452  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
453 
454  m0123 = _mm_unpacklo_epi32(r0, r1);
455  m4567 = _mm_unpackhi_epi32(r0, r1);
456  m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
457  m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
458  m89 = _mm_unpackhi_epi32(r4, r5);
459 
460  _mm_store_si128((xmmi*)out + 0, m0123);
461  _mm_store_si128((xmmi*)out + 1, m4567);
462  _mm_store_si128((xmmi*)out + 2, m89);
463 }
464 
465 typedef struct bignum25519mulprecomp_t {
466  xmmi r0,r2,r4,r6,r8;
467  xmmi r1,r3,r5,r7,r9;
468  xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
469 } bignum25519mulprecomp;
470 
471 /* precompute a constant to multiply by */
472 inline void
473 curve25519_mul_precompute(bignum25519mulprecomp *pre, const bignum25519 r) {
474  pre->r0 = _mm_load_si128((xmmi*)r + 0);
475  pre->r1 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(1,1,1,1));
476  pre->r1 = _mm_add_epi64(pre->r1, _mm_and_si128(pre->r1, sse2_top64bitmask.v));
477  pre->r2 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(2,2,2,2));
478  pre->r3 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(3,3,3,3));
479  pre->r3 = _mm_add_epi64(pre->r3, _mm_and_si128(pre->r3, sse2_top64bitmask.v));
480  pre->r0 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(0,0,0,0));
481  pre->r4 = _mm_load_si128((xmmi*)r + 1);
482  pre->r5 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(1,1,1,1));
483  pre->r5 = _mm_add_epi64(pre->r5, _mm_and_si128(pre->r5, sse2_top64bitmask.v));
484  pre->r6 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(2,2,2,2));
485  pre->r7 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(3,3,3,3));
486  pre->r7 = _mm_add_epi64(pre->r7, _mm_and_si128(pre->r7, sse2_top64bitmask.v));
487  pre->r4 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(0,0,0,0));
488  pre->r8 = _mm_load_si128((xmmi*)r + 2);
489  pre->r9 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,1,3,1));
490  pre->r9 = _mm_add_epi64(pre->r9, _mm_and_si128(pre->r9, sse2_top64bitmask.v));
491  pre->r8 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,0,3,0));
492 
493  pre->r219 = _mm_mul_epu32(pre->r2, packednineteen.v);
494  pre->r419 = _mm_mul_epu32(pre->r4, packednineteen.v);
495  pre->r619 = _mm_mul_epu32(pre->r6, packednineteen.v);
496  pre->r819 = _mm_mul_epu32(pre->r8, packednineteen.v);
497  pre->r119 = _mm_shuffle_epi32(pre->r1,_MM_SHUFFLE(0,0,2,2)); pre->r119 = _mm_mul_epu32(pre->r119, packednineteen.v);
498  pre->r319 = _mm_shuffle_epi32(pre->r3,_MM_SHUFFLE(0,0,2,2)); pre->r319 = _mm_mul_epu32(pre->r319, packednineteen.v);
499  pre->r519 = _mm_shuffle_epi32(pre->r5,_MM_SHUFFLE(0,0,2,2)); pre->r519 = _mm_mul_epu32(pre->r519, packednineteen.v);
500  pre->r719 = _mm_shuffle_epi32(pre->r7,_MM_SHUFFLE(0,0,2,2)); pre->r719 = _mm_mul_epu32(pre->r719, packednineteen.v);
501  pre->r919 = _mm_shuffle_epi32(pre->r9,_MM_SHUFFLE(0,0,2,2)); pre->r919 = _mm_mul_epu32(pre->r919, packednineteen.v);
502 }
503 
504 
505 /* multiply a bignum by a pre-computed constant */
506 inline void
507 curve25519_mul_precomputed(bignum25519 out, const bignum25519 s, const bignum25519mulprecomp *r) {
508  xmmi m01,m23,m45,m67,m89;
509  xmmi m0123,m4567;
510  xmmi s0123,s4567;
511  xmmi s01,s23,s45,s67,s89;
512  xmmi s12,s34,s56,s78,s9;
513  xmmi r0,r1,r2,r3,r4,r5;
514  xmmi c1,c2,c3;
515 
516  s0123 = _mm_load_si128((xmmi*)s + 0);
517  s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
518  s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
519  s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
520  s4567 = _mm_load_si128((xmmi*)s + 1);
521  s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
522  s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
523  s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
524  s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
525  s89 = _mm_load_si128((xmmi*)s + 2);
526  s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
527  s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
528  s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));
529 
530  m01 = _mm_mul_epu32(r->r1,s01);
531  m23 = _mm_mul_epu32(r->r1,s23);
532  m45 = _mm_mul_epu32(r->r1,s45);
533  m67 = _mm_mul_epu32(r->r1,s67);
534  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r3,s01));
535  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r3,s23));
536  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r3,s45));
537  m89 = _mm_mul_epu32(r->r1,s89);
538  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r5,s01));
539  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r5,s23));
540  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r3,s67));
541  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r7,s01));
542  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r5,s45));
543  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r7,s23));
544  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r9,s01));
545 
546  /* shift up */
547  m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
548  m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
549  m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
550  m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
551  m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));
552 
553  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r0,s01));
554  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r0,s23));
555  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r0,s45));
556  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r0,s67));
557  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r2,s01));
558  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r2,s23));
559  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r4,s23));
560  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r0,s89));
561  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r4,s01));
562  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r2,s45));
563  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r2,s67));
564  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r6,s01));
565  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r4,s45));
566  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r6,s23));
567  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r8,s01));
568  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r919,s12));
569  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r919,s34));
570  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r919,s56));
571  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r919,s78));
572  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r719,s34));
573  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r719,s56));
574  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r719,s78));
575  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r719,s9));
576  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r519,s56));
577  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r519,s78));
578  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r519,s9));
579  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r819,s89));
580  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r319,s78));
581  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r319,s9));
582  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r619,s89));
583  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r919,s9));
584  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r819,s23));
585  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r819,s45));
586  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r819,s67));
587  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r619,s45));
588  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r619,s67));
589  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r419,s67));
590  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r419,s89));
591  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r219,s89));
592  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r119,s9));
593 
594  r0 = _mm_unpacklo_epi64(m01, m45);
595  r1 = _mm_unpackhi_epi64(m01, m45);
596  r2 = _mm_unpacklo_epi64(m23, m67);
597  r3 = _mm_unpackhi_epi64(m23, m67);
598  r4 = _mm_unpacklo_epi64(m89, m89);
599  r5 = _mm_unpackhi_epi64(m89, m89);
600 
601  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
602  c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
603  c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
604  c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
605  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
606 
607  m0123 = _mm_unpacklo_epi32(r0, r1);
608  m4567 = _mm_unpackhi_epi32(r0, r1);
609  m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
610  m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
611  m89 = _mm_unpackhi_epi32(r4, r5);
612 
613  _mm_store_si128((xmmi*)out + 0, m0123);
614  _mm_store_si128((xmmi*)out + 1, m4567);
615  _mm_store_si128((xmmi*)out + 2, m89);
616 }
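A usage sketch, in the style of the caller near the end of this file (illustrative only): when one operand stays fixed across many multiplications, precompute it once and reuse the result.

    ALIGN(16) bignum25519 q, a, prod;   /* assume q and a already hold expanded values */
    bignum25519mulprecomp pre;
    curve25519_mul_precompute(&pre, q);          /* one-time setup for the fixed operand q */
    curve25519_mul_precomputed(prod, a, &pre);   /* prod = a * q (mod 2^255 - 19) */

This is exactly how curve25519_mult_SSE2 treats the peer's expanded point: it computes preq once before the ladder and multiplies by it on every iteration.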
617 
618 /* square a bignum 'count' times */
619 #define curve25519_square(r,x) curve25519_square_times(r,x,1)
620 
621 void
622 curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
623  xmmi m01,m23,m45,m67,m89;
624  xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
625  xmmi r0a,r1a,r2a,r3a,r7a,r9a;
626  xmmi r0123,r4567;
627  xmmi r01,r23,r45,r67,r6x,r89,r8x;
628  xmmi r12,r34,r56,r78,r9x;
629  xmmi r5619;
630  xmmi c1,c2,c3;
631 
632  r0123 = _mm_load_si128((xmmi*)in + 0);
633  r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
634  r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
635  r4567 = _mm_load_si128((xmmi*)in + 1);
636  r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
637  r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
638  r89 = _mm_load_si128((xmmi*)in + 2);
639  r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));
640 
641  do {
642  r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
643  r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
644  r0 = _mm_add_epi64(r0, _mm_and_si128(r0, sse2_top64bitmask.v));
645  r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
646  r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
647  r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
648  r2 = _mm_add_epi64(r2, _mm_and_si128(r2, sse2_top64bitmask.v));
649  r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
650  r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
651  r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
652  r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
653  r4 = _mm_add_epi64(r4, _mm_and_si128(r4, sse2_top64bitmask.v));
654  r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
655  r5619 = _mm_mul_epu32(r56, packednineteen.v);
656  r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
657  r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
658  r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
659  r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
660  r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
661  r7 = _mm_mul_epu32(r7, packed3819.v);
662  r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
663  r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
664  r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
665  r8 = _mm_mul_epu32(r8, packednineteen.v);
666  r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
667  r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
668  r9 = _mm_mul_epu32(r9, packed3819.v);
669  r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));
670 
671  m01 = _mm_mul_epu32(r01, r0);
672  m23 = _mm_mul_epu32(r23, r0a);
673  m45 = _mm_mul_epu32(r45, r0a);
674  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
675  r23 = _mm_slli_epi32(r23, 1);
676  m67 = _mm_mul_epu32(r67, r0a);
677  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
678  m89 = _mm_mul_epu32(r89, r0a);
679  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
680  r67 = _mm_slli_epi32(r67, 1);
681  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
682  r45 = _mm_slli_epi32(r45, 1);
683 
684  r1 = _mm_slli_epi32(r1, 1);
685  r3 = _mm_slli_epi32(r3, 1);
686  r1a = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_bot64bitmask.v));
687  r3a = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_bot64bitmask.v));
688 
689  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
690  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
691  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
692  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
693  r34 = _mm_slli_epi32(r34, 1);
694  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
695  r78 = _mm_slli_epi32(r78, 1);
696  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
697  r56 = _mm_slli_epi32(r56, 1);
698 
699  m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
700  m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
701  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
702  m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
703  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
704  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
705  m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
706  m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
707  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
708  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
709  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
710  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
711  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
712  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
713  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));
714 
715  r0 = _mm_unpacklo_epi64(m01, m45);
716  r1 = _mm_unpackhi_epi64(m01, m45);
717  r2 = _mm_unpacklo_epi64(m23, m67);
718  r3 = _mm_unpackhi_epi64(m23, m67);
719  r4 = _mm_unpacklo_epi64(m89, m89);
720  r5 = _mm_unpackhi_epi64(m89, m89);
721 
722  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
723  c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
724  c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
725  c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
726  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
727 
728  r01 = _mm_unpacklo_epi64(r0, r1);
729  r45 = _mm_unpackhi_epi64(r0, r1);
730  r23 = _mm_unpacklo_epi64(r2, r3);
731  r67 = _mm_unpackhi_epi64(r2, r3);
732  r89 = _mm_unpackhi_epi64(r4, r5);
733  } while (--count);
734 
735  r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
736  r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
737  r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
738  r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
739  r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));
740 
741  _mm_store_si128((xmmi*)r + 0, r0123);
742  _mm_store_si128((xmmi*)r + 1, r4567);
743  _mm_store_si128((xmmi*)r + 2, r89);
744 }
745 
746 /* square two packed bignums */
747 inline void
748 curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
749  xmmi r0,r1,r2,r3;
750  xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
751  xmmi d5,d6,d7,d8,d9;
752  xmmi c1,c2;
753 
754  r0 = r[0].v;
755  r1 = r[1].v;
756  r2 = r[2].v;
757  r3 = r[3].v;
758 
759  out[0].v = _mm_mul_epu32(r0, r0);
760  r0 = _mm_slli_epi32(r0, 1);
761  out[1].v = _mm_mul_epu32(r0, r1);
762  r1_2 = _mm_slli_epi32(r1, 1);
763  out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));
764  r1 = r1_2;
765  out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));
766  r3_2 = _mm_slli_epi32(r3, 1);
767  out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));
768  r2 = _mm_slli_epi32(r2, 1);
769  out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
770  r5_2 = _mm_slli_epi32(r[5].v, 1);
771  out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));
772  r3 = r3_2;
773  out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
774  r7_2 = _mm_slli_epi32(r[7].v, 1);
775  out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));
776  out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));
777 
778  d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
779  d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
780  d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
781  d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
782  d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);
783 
784  r4_2 = _mm_slli_epi32(r[4].v, 1);
785  r6_2 = _mm_slli_epi32(r[6].v, 1);
786  out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
787  out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 )))));
788  out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v)))));
789  out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
790  out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
791  out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 )));
792  out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v)));
793  out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
794  out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));
795 
796  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
797  c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
798  c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
799  c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
800  c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
801  c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
802  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
803 }
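The scaling constants follow the same rule as in curve25519_mul_packed64: products of high limbs fold down with the factor 19 from 2^255 = 19 (mod 2^255 - 19), and packedthirtyeight is simply 2*19, combining that fold with the doubling that the square's cross terms require.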
804 
805 /* make [nqx+nqz,nqpqx+nqpqz], [nqpqx-nqpqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */
806 inline void
807 curve25519_make_nqpq(packedelem64 *primex, packedelem64 *primez, const packedelem32 *pqx, const packedelem32 *pqz) {
808  primex[0].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(1,1,0,0));
809  primex[1].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(3,3,2,2));
810  primex[2].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(1,1,0,0));
811  primex[3].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(3,3,2,2));
812  primex[4].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(1,1,0,0));
813  primex[5].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(3,3,2,2));
814  primex[6].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(1,1,0,0));
815  primex[7].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(3,3,2,2));
816  primex[8].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(1,1,0,0));
817  primex[9].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(3,3,2,2));
818  primez[0].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(0,0,1,1));
819  primez[1].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(2,2,3,3));
820  primez[2].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(0,0,1,1));
821  primez[3].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(2,2,3,3));
822  primez[4].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(0,0,1,1));
823  primez[5].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(2,2,3,3));
824  primez[6].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(0,0,1,1));
825  primez[7].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(2,2,3,3));
826  primez[8].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(0,0,1,1));
827  primez[9].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(2,2,3,3));
828 }
829 
830 /* make [nqx+nqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */
831 inline void
832 curve25519_make_nq(packedelem64 *nq, const packedelem32 *pqx, const packedelem32 *pqz) {
833  nq[0].v = _mm_unpacklo_epi64(pqx[0].v, pqz[0].v);
834  nq[1].v = _mm_unpackhi_epi64(pqx[0].v, pqz[0].v);
835  nq[2].v = _mm_unpacklo_epi64(pqx[1].v, pqz[1].v);
836  nq[3].v = _mm_unpackhi_epi64(pqx[1].v, pqz[1].v);
837  nq[4].v = _mm_unpacklo_epi64(pqx[2].v, pqz[2].v);
838  nq[5].v = _mm_unpackhi_epi64(pqx[2].v, pqz[2].v);
839  nq[6].v = _mm_unpacklo_epi64(pqx[3].v, pqz[3].v);
840  nq[7].v = _mm_unpackhi_epi64(pqx[3].v, pqz[3].v);
841  nq[8].v = _mm_unpacklo_epi64(pqx[4].v, pqz[4].v);
842  nq[9].v = _mm_unpackhi_epi64(pqx[4].v, pqz[4].v);
843 }
844 
845 /* compute [nqx+nqz,nqx-nqz] from nqx, nqz */
846 inline void
847 curve25519_compute_nq(packedelem64 *nq, const bignum25519 nqx, const bignum25519 nqz) {
848  xmmi x0,x1,x2;
849  xmmi z0,z1,z2;
850  xmmi a0,a1,a2;
851  xmmi s0,s1,s2;
852  xmmi r0,r1;
853  xmmi c1,c2;
854  x0 = _mm_load_si128((xmmi*)nqx + 0);
855  x1 = _mm_load_si128((xmmi*)nqx + 1);
856  x2 = _mm_load_si128((xmmi*)nqx + 2);
857  z0 = _mm_load_si128((xmmi*)nqz + 0);
858  z1 = _mm_load_si128((xmmi*)nqz + 1);
859  z2 = _mm_load_si128((xmmi*)nqz + 2);
860  a0 = _mm_add_epi32(x0, z0);
861  a1 = _mm_add_epi32(x1, z1);
862  a2 = _mm_add_epi32(x2, z2);
863  s0 = _mm_add_epi32(x0, packed2p0.v);
864  s1 = _mm_add_epi32(x1, packed2p1.v);
865  s2 = _mm_add_epi32(x2, packed2p2.v);
866  s0 = _mm_sub_epi32(s0, z0);
867  s1 = _mm_sub_epi32(s1, z1);
868  s2 = _mm_sub_epi32(s2, z2);
869  r0 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(2,2,0,0)), sse2_bot32bitmask.v);
870  r1 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(3,3,1,1)), sse2_bot32bitmask.v);
871  c1 = _mm_srli_epi32(r0, 26);
872  c2 = _mm_srli_epi32(r1, 25);
873  r0 = _mm_and_si128(r0, packedmask26.v);
874  r1 = _mm_and_si128(r1, packedmask25.v);
875  r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
876  r1 = _mm_add_epi32(r1, c1);
877  s0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
878  s1 = _mm_add_epi32(s1, _mm_srli_si128(c2, 8));
879  nq[0].v = _mm_unpacklo_epi64(a0, s0);
880  nq[2].v = _mm_unpackhi_epi64(a0, s0);
881  nq[4].v = _mm_unpacklo_epi64(a1, s1);
882  nq[6].v = _mm_unpackhi_epi64(a1, s1);
883  nq[8].v = _mm_unpacklo_epi64(a2, s2);
884  nq[1].v = _mm_shuffle_epi32(nq[0].v, _MM_SHUFFLE(3,3,1,1));
885  nq[3].v = _mm_shuffle_epi32(nq[2].v, _MM_SHUFFLE(3,3,1,1));
886  nq[5].v = _mm_shuffle_epi32(nq[4].v, _MM_SHUFFLE(3,3,1,1));
887  nq[7].v = _mm_shuffle_epi32(nq[6].v, _MM_SHUFFLE(3,3,1,1));
888  nq[9].v = _mm_shuffle_epi32(nq[8].v, _MM_SHUFFLE(3,3,1,1));
889 }
890 
891 
892 /* compute [x+z,x-z] from [x,z] */
893 inline void
894 curve25519_addsub_packed64(packedelem64 *r) {
895  packed32bignum25519 x,z,add,sub;
896 
897  x[0].v = _mm_unpacklo_epi64(r[0].v, r[1].v);
898  z[0].v = _mm_unpackhi_epi64(r[0].v, r[1].v);
899  x[1].v = _mm_unpacklo_epi64(r[2].v, r[3].v);
900  z[1].v = _mm_unpackhi_epi64(r[2].v, r[3].v);
901  x[2].v = _mm_unpacklo_epi64(r[4].v, r[5].v);
902  z[2].v = _mm_unpackhi_epi64(r[4].v, r[5].v);
903  x[3].v = _mm_unpacklo_epi64(r[6].v, r[7].v);
904  z[3].v = _mm_unpackhi_epi64(r[6].v, r[7].v);
905  x[4].v = _mm_unpacklo_epi64(r[8].v, r[9].v);
906  z[4].v = _mm_unpackhi_epi64(r[8].v, r[9].v);
907 
908  curve25519_add_packed32(add, x, z);
909  curve25519_sub_packed32(sub, x, z);
910 
911  r[0].v = _mm_unpacklo_epi64(add[0].v, sub[0].v);
912  r[1].v = _mm_unpackhi_epi64(add[0].v, sub[0].v);
913  r[2].v = _mm_unpacklo_epi64(add[1].v, sub[1].v);
914  r[3].v = _mm_unpackhi_epi64(add[1].v, sub[1].v);
915  r[4].v = _mm_unpacklo_epi64(add[2].v, sub[2].v);
916  r[5].v = _mm_unpackhi_epi64(add[2].v, sub[2].v);
917  r[6].v = _mm_unpacklo_epi64(add[3].v, sub[3].v);
918  r[7].v = _mm_unpackhi_epi64(add[3].v, sub[3].v);
919  r[8].v = _mm_unpacklo_epi64(add[4].v, sub[4].v);
920  r[9].v = _mm_unpackhi_epi64(add[4].v, sub[4].v);
921 }
922 
923 /* compute [x,z] * [121666,121665] */
924 inline void
925 curve25519_121665_packed64(packedelem64 *out, const packedelem64 *in) {
926  xmmi c1,c2;
927 
928  out[0].v = _mm_mul_epu32(in[0].v, packed121666121665.v);
929  out[1].v = _mm_mul_epu32(in[1].v, packed121666121665.v);
930  out[2].v = _mm_mul_epu32(in[2].v, packed121666121665.v);
931  out[3].v = _mm_mul_epu32(in[3].v, packed121666121665.v);
932  out[4].v = _mm_mul_epu32(in[4].v, packed121666121665.v);
933  out[5].v = _mm_mul_epu32(in[5].v, packed121666121665.v);
934  out[6].v = _mm_mul_epu32(in[6].v, packed121666121665.v);
935  out[7].v = _mm_mul_epu32(in[7].v, packed121666121665.v);
936  out[8].v = _mm_mul_epu32(in[8].v, packed121666121665.v);
937  out[9].v = _mm_mul_epu32(in[9].v, packed121666121665.v);
938 
939  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
940  c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
941  c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
942  c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
943  c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
944  c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
945  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
946 }
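The two constants come from the Montgomery coefficient A = 486662 of curve25519: 121665 = (A - 2)/4 and 121666 = (A + 2)/4. They appear as a pair because of the identity used by the ladder step below,

    ((x - z)*121665) + x = 121666*x - 121665*z

which is why the packed [x, z] pair can simply be multiplied lane-wise by [121666, 121665] here, with the subtraction deferred to curve25519_final_nq.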
947 
948 /* compute [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
949 inline void
950 curve25519_final_nq(packedelem64 *nq, const packedelem64 *sq, const packedelem64 *sq121665) {
951  packed32bignum25519 x, z, sub;
952  packed64bignum25519 t, nqa, nqb;
953 
954  x[0].v = _mm_or_si128(_mm_unpacklo_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[0].v, sq121665[1].v), 4));
955  z[0].v = _mm_or_si128(_mm_unpackhi_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[0].v, sq121665[1].v), 4));
956  x[1].v = _mm_or_si128(_mm_unpacklo_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[2].v, sq121665[3].v), 4));
957  z[1].v = _mm_or_si128(_mm_unpackhi_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[2].v, sq121665[3].v), 4));
958  x[2].v = _mm_or_si128(_mm_unpacklo_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[4].v, sq121665[5].v), 4));
959  z[2].v = _mm_or_si128(_mm_unpackhi_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[4].v, sq121665[5].v), 4));
960  x[3].v = _mm_or_si128(_mm_unpacklo_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[6].v, sq121665[7].v), 4));
961  z[3].v = _mm_or_si128(_mm_unpackhi_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[6].v, sq121665[7].v), 4));
962  x[4].v = _mm_or_si128(_mm_unpacklo_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[8].v, sq121665[9].v), 4));
963  z[4].v = _mm_or_si128(_mm_unpackhi_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[8].v, sq121665[9].v), 4));
964 
965  curve25519_sub_packed32(sub, x, z);
966 
967  t[0].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(1,1,0,0));
968  t[1].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(3,3,2,2));
969  t[2].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(1,1,0,0));
970  t[3].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(3,3,2,2));
971  t[4].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(1,1,0,0));
972  t[5].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(3,3,2,2));
973  t[6].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(1,1,0,0));
974  t[7].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(3,3,2,2));
975  t[8].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(1,1,0,0));
976  t[9].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(3,3,2,2));
977 
978  nqa[0].v = _mm_unpacklo_epi64(sq[0].v, t[0].v);
979  nqb[0].v = _mm_unpackhi_epi64(sq[0].v, t[0].v);
980  nqa[1].v = _mm_unpacklo_epi64(sq[1].v, t[1].v);
981  nqb[1].v = _mm_unpackhi_epi64(sq[1].v, t[1].v);
982  nqa[2].v = _mm_unpacklo_epi64(sq[2].v, t[2].v);
983  nqb[2].v = _mm_unpackhi_epi64(sq[2].v, t[2].v);
984  nqa[3].v = _mm_unpacklo_epi64(sq[3].v, t[3].v);
985  nqb[3].v = _mm_unpackhi_epi64(sq[3].v, t[3].v);
986  nqa[4].v = _mm_unpacklo_epi64(sq[4].v, t[4].v);
987  nqb[4].v = _mm_unpackhi_epi64(sq[4].v, t[4].v);
988  nqa[5].v = _mm_unpacklo_epi64(sq[5].v, t[5].v);
989  nqb[5].v = _mm_unpackhi_epi64(sq[5].v, t[5].v);
990  nqa[6].v = _mm_unpacklo_epi64(sq[6].v, t[6].v);
991  nqb[6].v = _mm_unpackhi_epi64(sq[6].v, t[6].v);
992  nqa[7].v = _mm_unpacklo_epi64(sq[7].v, t[7].v);
993  nqb[7].v = _mm_unpackhi_epi64(sq[7].v, t[7].v);
994  nqa[8].v = _mm_unpacklo_epi64(sq[8].v, t[8].v);
995  nqb[8].v = _mm_unpackhi_epi64(sq[8].v, t[8].v);
996  nqa[9].v = _mm_unpacklo_epi64(sq[9].v, t[9].v);
997  nqb[9].v = _mm_unpackhi_epi64(sq[9].v, t[9].v);
998 
999  curve25519_mul_packed64(nq, nqa, nqb);
1000 }
1001 
1002 /*
1003  * In: b = 2^5 - 2^0
1004  * Out: b = 2^250 - 2^0
1005  */
1006 void
1007 curve25519_pow_two5mtwo0_two250mtwo0(bignum25519 b) {
1008  ALIGN(16) bignum25519 t0,c;
1009 
1010  /* 2^5 - 2^0 */ /* b */
1011  /* 2^10 - 2^5 */ curve25519_square_times(t0, b, 5);
1012  /* 2^10 - 2^0 */ curve25519_mul(b, t0, b);
1013  /* 2^20 - 2^10 */ curve25519_square_times(t0, b, 10);
1014  /* 2^20 - 2^0 */ curve25519_mul(c, t0, b);
1015  /* 2^40 - 2^20 */ curve25519_square_times(t0, c, 20);
1016  /* 2^40 - 2^0 */ curve25519_mul(t0, t0, c);
1017  /* 2^50 - 2^10 */ curve25519_square_times(t0, t0, 10);
1018  /* 2^50 - 2^0 */ curve25519_mul(b, t0, b);
1019  /* 2^100 - 2^50 */ curve25519_square_times(t0, b, 50);
1020  /* 2^100 - 2^0 */ curve25519_mul(c, t0, b);
1021  /* 2^200 - 2^100 */ curve25519_square_times(t0, c, 100);
1022  /* 2^200 - 2^0 */ curve25519_mul(t0, t0, c);
1023  /* 2^250 - 2^50 */ curve25519_square_times(t0, t0, 50);
1024  /* 2^250 - 2^0 */ curve25519_mul(b, t0, b);
1025 }
1026 
1027 /*
1028  * z^(p - 2) = z^(2^255 - 21), where p = 2^255 - 19
1029  */
1030 void
1031 curve25519_recip(bignum25519 out, const bignum25519 z) {
1032  ALIGN(16) bignum25519 a, t0, b;
1033 
1034  /* 2 */ curve25519_square(a, z); /* a = 2 */
1035  /* 8 */ curve25519_square_times(t0, a, 2);
1036  /* 9 */ curve25519_mul(b, t0, z); /* b = 9 */
1037  /* 11 */ curve25519_mul(a, b, a); /* a = 11 */
1038  /* 22 */ curve25519_square(t0, a);
1039  /* 2^5 - 2^0 = 31 */ curve25519_mul(b, t0, b);
1040  /* 2^250 - 2^0 */ curve25519_pow_two5mtwo0_two250mtwo0(b);
1041  /* 2^255 - 2^5 */ curve25519_square_times(b, b, 5);
1042  /* 2^255 - 21 */ curve25519_mul(out, b, a);
1043 }
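This is the standard Fermat inversion: with p = 2^255 - 19,

    z^(p - 2) = z^(2^255 - 21) = z^(-1)   (mod p)

and the exponent is assembled exactly as the comments trace it: the helper leaves b at exponent 2^250 - 1, the five extra squarings raise it to 2^255 - 32, and the final multiply by a (exponent 11) gives 2^255 - 32 + 11 = 2^255 - 21.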
1044 
1045 ANONYMOUS_NAMESPACE_END
1046 
1047 NAMESPACE_BEGIN(CryptoPP)
1048 NAMESPACE_BEGIN(Donna)
1049 
1050 int curve25519_mult_SSE2(byte sharedKey[32], const byte secretKey[32], const byte othersKey[32])
1051 {
1053  for (size_t i = 0;i < 32;++i)
1054  e[i] = secretKey[i];
1055  e[0] &= 0xf8; e[31] &= 0x7f; e[31] |= 0x40; /* clamp the scalar: clear the low 3 bits, clear bit 255, set bit 254 */
1056 
1057  ALIGN(16) bignum25519 nqx = {1}, nqpqz = {1}, nqz = {0}, nqpqx, zmone;
1058  packed32bignum25519 qx, qz, pqz, pqx;
1059  packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
1060  bignum25519mulprecomp preq;
1061  size_t i=0, bit=0, lastbit=0;
1062 
1063  curve25519_expand(nqpqx, othersKey);
1064  curve25519_mul_precompute(&preq, nqpqx);
1065 
1066  /* do bits 254..3 */
1067  for (i = 254, lastbit=0; i >= 3; i--) {
1068  bit = (e[i/8] >> (i & 7)) & 1;
1069  curve25519_swap_conditional(nqx, nqpqx, (word32)(bit ^ lastbit));
1070  curve25519_swap_conditional(nqz, nqpqz, (word32)(bit ^ lastbit));
1071  lastbit = bit;
1072 
1073  curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */
1074  curve25519_tangle32(qz, nqz, nqpqz); /* qz = [nqz,nqpqz] */
1075 
1076  curve25519_add_packed32(pqx, qx, qz); /* pqx = [nqx+nqz,nqpqx+nqpqz] */
1077  curve25519_sub_packed32(pqz, qx, qz); /* pqz = [nqx-nqz,nqpqx-nqpqz] */
1078 
1079  curve25519_make_nqpq(primex, primez, pqx, pqz); /* primex = [nqx+nqz,nqpqx+nqpqz], primez = [nqpqx-nqpqz,nqx-nqz] */
1080  curve25519_mul_packed64(prime, primex, primez); /* prime = [nqx+nqz,nqpqx+nqpqz] * [nqpqx-nqpqz,nqx-nqz] */
1081  curve25519_addsub_packed64(prime); /* prime = [prime.x+prime.z,prime.x-prime.z] */
1082  curve25519_square_packed64(nqpq, prime); /* nqpq = prime^2 */
1083  curve25519_untangle64(nqpqx, nqpqz, nqpq);
1084  curve25519_mul_precomputed(nqpqz, nqpqz, &preq); /* nqpqz = nqpqz * q */
1085 
1086  /* (((sq.x-sq.z)*121665)+sq.x) * (sq.x-sq.z) is equivalent to (sq.x*121666-sq.z*121665) * (sq.x-sq.z) */
1087  curve25519_make_nq(nq, pqx, pqz); /* nq = [nqx+nqz,nqx-nqz] */
1088  curve25519_square_packed64(sq, nq); /* sq = nq^2 */
1089  curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
1090  curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
1091  curve25519_untangle64(nqx, nqz, nq);
1092  };
1093 
1094  /* it's possible to get rid of this swap by doing the swap at the bottom of the
1095  above loop instead of at the top, but compilers seem to optimize better this way */
1096  curve25519_swap_conditional(nqx, nqpqx, (word32)bit);
1097  curve25519_swap_conditional(nqz, nqpqz, (word32)bit);
1098 
1099  /* do bits 2..0 */
1100  for (i = 0; i < 3; i++) {
1101  curve25519_compute_nq(nq, nqx, nqz);
1102  curve25519_square_packed64(sq, nq); /* sq = nq^2 */
1103  curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
1104  curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
1105  curve25519_untangle64(nqx, nqz, nq);
1106  }
1107 
1108  curve25519_recip(zmone, nqz);
1109  curve25519_mul(nqz, nqx, zmone);
1110  curve25519_contract(sharedKey, nqz);
1111 
1112  return 0;
1113 }
1114 
1115 NAMESPACE_END // Donna
1116 NAMESPACE_END // CryptoPP
1117 
1118 #endif // CRYPTOPP_CURVE25519_SSE2
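For orientation, a minimal caller might look like the sketch below. It assumes donna.h declares curve25519_mult_SSE2 and that the library was built with CRYPTOPP_CURVE25519_SSE2 enabled; in practice the generic Donna::curve25519_mult entry point normally selects this routine when it is available. The key material shown is placeholder data.

    #include "donna.h"

    void shared_secret_sketch()
    {
        using CryptoPP::byte;
        byte secret[32] = {0};   /* fill with 32 random bytes in real use; clamping happens internally */
        byte peer[32]   = {0};   /* the other party's curve25519 public value */
        byte shared[32];
        CryptoPP::Donna::curve25519_mult_SSE2(shared, secret, peer);
    }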