#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

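/*
 * LD(x, y) loads the x-th quadword of the destination buffer (%1) into
 * MMX register y, XOn(x, y) xors in the x-th quadword of the n-th extra
 * source buffer (operands %2..%5), and ST(x, y) stores the result back
 * to the destination.
 */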
#define LD(x, y)        "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)        "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)       "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)       "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)       "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)       "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"

#include <asm/i387.h>

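/*
 * Each routine below brackets its MMX usage with kernel_fpu_begin()/
 * kernel_fpu_end(), since the MMX registers alias the x87 FPU state.
 * The pII_mmx variants process 128 bytes per loop iteration (hence
 * bytes >> 7): four BLOCK()s of four quadwords each, with the loads,
 * xors and stores of neighbouring quadwords interleaved.
 */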
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
        ST(i, 0)                                \
                XO1(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO1(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO1(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}
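
/*
 * The 3-, 4- and 5-source versions below follow the same pattern,
 * adding one further pxor stage per extra source buffer.
 */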

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
        ST(i, 0)                                \
                XO2(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO2(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO2(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
        ST(i, 0)                                \
                XO3(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO3(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO3(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}


static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
                XO3(i + 1, 1)                   \
                        XO3(i + 2, 2)           \
                                XO3(i + 3, 3)   \
        XO4(i, 0)                               \
        ST(i, 0)                                \
                XO4(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO4(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO4(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       addl $128, %5         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

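/*
 * The p5_mmx variants below work on 64 bytes per iteration (hence
 * bytes >> 6), using all eight MMX registers with the loads, pxors and
 * stores hand-interleaved; the scheduling is presumably tuned to the
 * original Pentium's (P5's) instruction pairing rules.
 */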
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32                  ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor   (%5), %%mm0   ;\n"
        "       pxor  8(%5), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%5), %%mm2   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%5), %%mm3   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 32(%5), %%mm4   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       pxor 40(%5), %%mm5   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%5), %%mm6   ;\n"
        "       pxor 56(%5), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       addl $64, %5         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
        .name = "pII_mmx",
        .do_2 = xor_pII_mmx_2,
        .do_3 = xor_pII_mmx_3,
        .do_4 = xor_pII_mmx_4,
        .do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
        .name = "p5_mmx",
        .do_2 = xor_p5_mmx_2,
        .do_3 = xor_p5_mmx_3,
        .do_4 = xor_p5_mmx_4,
        .do_5 = xor_p5_mmx_5,
};

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

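/*
 * OFFS() counts in 16-byte (one xmm register) units.  PF_OFFS() points
 * 256 bytes, i.e. one full loop iteration (bytes >> 8), ahead of the
 * current position, and the PFn() macros issue prefetchnta so the
 * streamed source data causes minimal cache pollution.
 */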
#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%1)            ;\n"
#define LD(x, y)        "       movaps   "OFFS(x)"(%1), %%xmm"#y"       ;\n"
#define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%1)       ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%2)            ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%3)            ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%4)            ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%5)            ;\n"
#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%6)            ;\n"
#define XO1(x, y)       "       xorps   "OFFS(x)"(%2), %%xmm"#y"        ;\n"
#define XO2(x, y)       "       xorps   "OFFS(x)"(%3), %%xmm"#y"        ;\n"
#define XO3(x, y)       "       xorps   "OFFS(x)"(%4), %%xmm"#y"        ;\n"
#define XO4(x, y)       "       xorps   "OFFS(x)"(%5), %%xmm"#y"        ;\n"
#define XO5(x, y)       "       xorps   "OFFS(x)"(%6), %%xmm"#y"        ;\n"


static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                        \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       addl $256, %5           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

static struct xor_block_template xor_block_pIII_sse = {
        .name = "pIII_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};

/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                               \
do {                                                    \
        xor_speed(&xor_block_8regs);                    \
        xor_speed(&xor_block_8regs_p);                  \
        xor_speed(&xor_block_32regs);                   \
        xor_speed(&xor_block_32regs_p);                 \
        AVX_XOR_SPEED;                                  \
        if (cpu_has_xmm)                                \
                xor_speed(&xor_block_pIII_sse);         \
        if (cpu_has_mmx) {                              \
                xor_speed(&xor_block_pII_mmx);          \
                xor_speed(&xor_block_p5_mmx);           \
        }                                               \
} while (0)
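
/*
 * At boot, calibrate_xor_blocks() (crypto/xor.c) runs xor_speed() on
 * each template listed above and would normally pick the fastest one;
 * the XOR_SELECT_TEMPLATE() hook below lets this header override that
 * choice.
 */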

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only, depending on how the CPU
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST)                    \
        AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

#endif /* _ASM_X86_XOR_32_H */