/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 */
.file "cast5-avx-x86_64-asm_64.S"

/* structure of crypto context */
#define km	0
#define kr	(16*4)
#define rr	((16*4)+16)
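
/*
 * km holds the 16 32-bit masking subkeys, kr the 16 rotation subkeys
 * (5 bits each, one per byte), and rr is a flag that is non-zero when
 * the reduced 12-round variant is used (keys of 80 bits or less).
 */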
/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
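
/*
 * lookup_32bit() performs the four 8-bit-indexed S-box lookups of the
 * CAST5 F function on the 32-bit value at the bottom of GPR 'src',
 * folding the results into 'dst' with op1/op2/op3.  interleave_op
 * (dummy or shr_next) optionally shifts src so that a following call
 * can process the upper 32-bit half of the same register.
 */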
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;
#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;
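
/*
 * F_head() starts the F function for two vectors of block halves:
 * op0 combines the masking key RKM with the data, the
 * vpslld/vpsrld/vpor triple performs the key-dependent left rotate
 * (RKRF holds the rotate count, RKRR holds 32 - RKRF), and the
 * rotated words are moved out to the GPR pairs gi1/gi2 for the
 * S-box lookups.
 */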
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;        \
	vpslld	RKRF,	x,    RTMP;     \
	vpsrld	RKRR,	x,    x;        \
	vpor	RTMP,	x,    x;        \
	\
	vmovq		x,    gi1;      \
	vpextrq $1,	x,    gi2;
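
/*
 * F_tail() finishes the F function: the S-box lookups run on each
 * 32-bit word, the two 32-bit results per GPR pair are merged with
 * shlq/orq, and the combined values are inserted back into x.
 */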
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;
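
/*
 * The three CAST5 round function types (RFC 2144, section 2.2) differ
 * only in which of +, - and ^ are used for the key mix (op0) and for
 * folding the four S-box outputs together (op1..op3).
 */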
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define subround(a1, b1, a2, b2, f) \
	F ## f ## _2(a1, b1, a2, b2);
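
/*
 * round() broadcasts masking subkey km[n] to all lanes, masks the low
 * five bits of the current rotation byte into RKRF, derives the
 * complementary right-shift count 32 - RKRF in RKRR, and byte-shifts
 * RKR so the next round sees its own rotation byte.
 */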
#define round(l, r, n, f) \
	vbroadcastss	(km+(4*n))(CTX), RKM;        \
	vpand		R1ST,            RKR,  RKRF; \
	vpsubq		RKRF,            R32,  RKRR; \
	vpsrldq $1,	RKR,             RKR;        \
	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
#define enc_preload_rkr() \
	vbroadcastss	.L16_mask,       RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX),         RKR, RKR;

#define dec_preload_rkr() \
	vbroadcastss	.L16_mask,       RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX),         RKR, RKR; \
	vpshufb		.Lbswap128_mask, RKR, RKR;
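
/*
 * XORing with 16 adds 16 to each 5-bit rotation count (mod 32); the
 * extra rotation lines the word's bytes up for the bh/bl addressing
 * in lookup_32bit().  For decryption, vpshufb additionally reverses
 * the 16 rotation bytes so the rounds can be walked backwards.
 */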
#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t1; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1;
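
/*
 * transpose_2x4() regroups two registers of four (l, r) block halves
 * so that x0 ends up with the four left halves and x1 with the four
 * right halves; applying it a second time on output restores the
 * original block layout.
 */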
#define inpack_blocks(in, x0, x1, t0, t1, rmask) \
	vmovdqu (0*4*4)(in),	x0; \
	vmovdqu (1*4*4)(in),	x1; \
	vpshufb rmask,	x0,	x0; \
	vpshufb rmask,	x1,	x1; \
	\
	transpose_2x4(x0, x1, t0, t1)
#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb rmask,	x0, x0;           \
	vpshufb rmask,	x1, x1;           \
	vmovdqu	x0, (0*4*4)(out);         \
	vmovdqu	x1, (1*4*4)(out);
#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb rmask,	x0, x0;           \
	vpshufb rmask,	x1, x1;           \
	vpxor (0*4*4)(out),	x0, x0;   \
	vmovdqu	x0, (0*4*4)(out);         \
	vpxor (1*4*4)(out),	x1, x1;   \
	vmovdqu	x1, (1*4*4)(out);
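
/*
 * Each inpack/outunpack handles four 64-bit blocks: the loads are
 * byte-swapped to CAST5's big-endian convention and transposed into
 * left/right half registers.  The _xor_ variant XORs the result into
 * the existing output instead of overwriting it, for callers that
 * request xor output.
 */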
.data

.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.L16_mask:
	.byte 16, 16, 16, 16
.L32_mask:
	.byte 32, 0, 0, 0
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0
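
/*
 * .Lbswap_mask byte-swaps each 32-bit word; .Lbswap128_mask reverses
 * the whole 128-bit vector (used to reverse the rotation bytes for
 * decryption); .Lfirst_mask keeps only the low five bits of the first
 * byte when a rotation count is extracted.
 */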
.text

.align 16
.global __cast5_enc_blk_16way
.type __cast5_enc_blk_16way,@function;

__cast5_enc_blk_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	pushq %rbp;
	pushq %rbx;
	pushq %rcx;
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	enc_preload_rkr();
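
	/*
	 * RKM doubles as the byte-swap mask while the blocks are loaded;
	 * R1ST and R32 feed the per-round rotate-count computation.
	 */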
	leaq 1*(2*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
	leaq 2*(2*4*4)(%rdx), %rax;
	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
	leaq 3*(2*4*4)(%rdx), %rax;
	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);

	movq %rsi, %r11;
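
	/*
	 * Four inpack_blocks() calls load all 16 blocks, four per RL/RR
	 * register pair; dst is stashed in %r11 because the rounds
	 * clobber %rsi for the S-box lookups.
	 */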
	round(RL, RR, 0, 1);
	round(RR, RL, 1, 2);
	round(RL, RR, 2, 3);
	round(RR, RL, 3, 1);
	round(RL, RR, 4, 2);
	round(RR, RL, 5, 3);
	round(RL, RR, 6, 1);
	round(RR, RL, 7, 2);
	round(RL, RR, 8, 3);
	round(RR, RL, 9, 1);
	round(RL, RR, 10, 2);
	round(RR, RL, 11, 3);
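
	/*
	 * Keys of 80 bits or less use the 12-round variant: rr(CTX) is
	 * then non-zero and rounds 13..16 are skipped.
	 */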
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz __skip_enc;

	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);
__skip_enc:
	popq %rcx;
	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;
	leaq 1*(2*4*4)(%r11), %rax;

	testb %cl, %cl;
	jnz __enc_xor16;
	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
	leaq 2*(2*4*4)(%r11), %rax;
	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
	leaq 3*(2*4*4)(%r11), %rax;
	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
	ret;

__enc_xor16:
	outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
	leaq 2*(2*4*4)(%r11), %rax;
	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
	leaq 3*(2*4*4)(%r11), %rax;
	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);

	ret;
.align 16
.global cast5_dec_blk_16way
.type cast5_dec_blk_16way,@function;

cast5_dec_blk_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	dec_preload_rkr();
	leaq 1*(2*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
	leaq 2*(2*4*4)(%rdx), %rax;
	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
	leaq 3*(2*4*4)(%rdx), %rax;
	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);

	movq %rsi, %r11;
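
	/*
	 * For the 12-round variant decryption starts at round 12: the
	 * __skip_dec path below also drops the four rotation bytes
	 * belonging to rounds 13..16 from RKR.
	 */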
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz __skip_dec;

	round(RL, RR, 15, 1);
	round(RR, RL, 14, 3);
	round(RL, RR, 13, 2);
	round(RR, RL, 12, 1);
__dec_tail:
	round(RL, RR, 11, 3);
	round(RR, RL, 10, 2);
	round(RL, RR, 9, 1);
	round(RR, RL, 8, 3);
	round(RL, RR, 7, 2);
	round(RR, RL, 6, 1);
	round(RL, RR, 5, 3);
	round(RR, RL, 4, 2);
	round(RL, RR, 3, 1);
	round(RR, RL, 2, 3);
	round(RL, RR, 1, 2);
	round(RR, RL, 0, 1);
	vmovdqa .Lbswap_mask, RKM;
	popq %rbx;
	popq %rbp;

	leaq 1*(2*4*4)(%r11), %rax;
	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
	leaq 2*(2*4*4)(%r11), %rax;
	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
	leaq 3*(2*4*4)(%r11), %rax;
	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
	ret;

__skip_dec:
	vpsrldq $4, RKR, RKR;
	jmp __dec_tail;