/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

.file "cast6-avx-x86_64-asm_64.S"
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km	0
#define kr	(12*4*4)

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX  %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rbp
#define RID1d %ebp
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d
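/*
 * Look up the four s-box entries for one 32-bit value held in a GPR: bh/bl
 * index s1/s2, the register is shifted right by 16 bits, and bh/bl then
 * index s3/s4.  op1..op3 combine the entries (xor/sub/add, depending on the
 * round function type); interleave_op lets the shift for the following
 * lookup start early to hide latency.
 */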
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);                   \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;
#define dummy(d) /* do nothing */
#define shr_next(reg) \
	shrq $16,	reg;
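/*
 * First half of the CAST-256 f-function: combine the 128-bit input with the
 * broadcast round key (op0 = add/xor/sub), rotate each 32-bit word left by
 * the round's key rotation (RKRF/RKRR hold the left/right shift counts),
 * and move the result to general-purpose registers for the s-box lookups.
 */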
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;     \
	vpslld	RKRF,	x,    RTMP;  \
	vpsrld	RKRR,	x,    x;     \
	vpor	RTMP,	x,    x;     \
	\
	vmovq		x,    gi1;   \
	vpextrq $1,	x,    gi2;
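/*
 * Second half of the f-function: do the four s-box lookups for each 32-bit
 * word and repack the 64-bit results into an xmm register.
 */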
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);             \
	F_head(b2, RX, RGI3, RGI4, op0);             \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);   \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor		a1, RX,   a1;                \
	vpxor		a2, RTMP, a2;
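/*
 * CAST-256 uses three round function types that differ only in which of
 * add/xor/sub mixes the round key (op0) and combines the s-box entries
 * (op1..op3).
 */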
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
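/*
 * Fetch the keys for round nn: broadcast the 32-bit masking key km[nn] into
 * RKM and split the lowest rotation byte of RKR into a left shift count
 * (RKRF) and the complementary right count (RKRR = 32 - RKRF), then advance
 * RKR by one byte.
 */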
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;       \
	vpand		R1ST,               RKR, RKRF; \
	vpsubq		RKRF,               R32, RKRR; \
	vpsrldq $1,	RKR,                RKR;
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);
#define shuffle(mask) \
	vpshufb		mask, RKR, RKR;
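/*
 * Load the 16 rotation bytes used by one group of four quad-rounds and add
 * 16 to each rotation (mod 32; the xor with .L16_mask flips the top bit of
 * the 5-bit value), which puts the s-box index bytes where lookup_32bit
 * reads them.  do_mask() optionally reorders the bytes to match the order
 * in which Q/QBAR consume them.
 */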
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask,      RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);
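/*
 * 4x4 32-bit matrix transpose: convert four per-block registers into four
 * per-word registers, so that x0 ends up holding word 0 of all four blocks.
 */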
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq	x1, x0, t0; \
	vpunpckhdq	x1, x0, t2; \
	vpunpckldq	x3, x2, t1; \
	vpunpckhdq	x3, x2, x3; \
	\
	vpunpcklqdq	t1, t0, x0; \
	vpunpckhqdq	t1, t0, x1; \
	vpunpcklqdq	x3, t2, x2; \
	vpunpckhqdq	x3, t2, x3;
#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
	vmovdqu (0*4*4)(in),	x0; \
	vmovdqu (1*4*4)(in),	x1; \
	vmovdqu (2*4*4)(in),	x2; \
	vmovdqu (3*4*4)(in),	x3; \
	vpshufb rmask, x0,	x0; \
	vpshufb rmask, x1,	x1; \
	vpshufb rmask, x2,	x2; \
	vpshufb rmask, x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0;           \
	vpshufb rmask, x1, x1;           \
	vpshufb rmask, x2, x2;           \
	vpshufb rmask, x3, x3;           \
	vmovdqu x0, (0*4*4)(out);        \
	vmovdqu x1, (1*4*4)(out);        \
	vmovdqu x2, (2*4*4)(out);        \
	vmovdqu x3, (3*4*4)(out);
#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0;           \
	vpshufb rmask, x1, x1;           \
	vpshufb rmask, x2, x2;           \
	vpshufb rmask, x3, x3;           \
	vpxor (0*4*4)(out), x0, x0;      \
	vmovdqu x0, (0*4*4)(out);        \
	vpxor (1*4*4)(out), x1, x1;      \
	vmovdqu x1, (1*4*4)(out);        \
	vpxor (2*4*4)(out), x2, x2;      \
	vmovdqu x2, (2*4*4)(out);        \
	vpxor (3*4*4)(out), x3, x3;      \
	vmovdqu x3, (3*4*4)(out);
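/*
 * vpshufb masks used by preload_rkr: each Q or QBAR quad-round consumes
 * four rotation bytes serially, QBAR consumes its four in reverse order,
 * and decryption walks the key material backwards, so the kr bytes of each
 * 16-byte group are pre-arranged to match.
 */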
.data

.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.L16_mask:
	.byte 16, 16, 16, 16
.L32_mask:
	.byte 32, 0, 0, 0
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text
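/*
 * Eight blocks are processed as two transposed sets of four: RA1..RD1 hold
 * one set and RA2..RD2 the other, with the macros above operating on both.
 */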
.align 8
.global __cast6_enc_blk_8way
.type   __cast6_enc_blk_8way,@function;

__cast6_enc_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	pushq %rbp;
	pushq %rbx;
	pushq %rcx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	movq %rsi, %r11;

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rcx;
	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;
	leaq (4*4*4)(%r11), %rax;

	testb %cl, %cl;
	jnz __enc_xor8;

	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;

__enc_xor8:
	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
.align 8
.global cast6_dec_blk_8way
.type   cast6_dec_blk_8way,@function;

cast6_dec_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	movq %rsi, %r11;

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;
	leaq (4*4*4)(%r11), %rax;
	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;