x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line (f317820c) · Commits · e / devices / android_kernel_fairphone_FP5

arch/x86/include/asm/xor.h

+172 −0

Original line number	Diff line number	Diff line
		@@ -58,6 +58,14 @@
		#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
		#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
		#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
		#define NOP(x)

		/*
		 * BLK64(pf, op, i) - emit one prefetch-style macro followed by four
		 * data operations.
		 *
		 * Expands to pf(i) and then op(i, 0) .. op(i + 3, 3): four consecutive
		 * slots starting at index i, each paired with registers 0..3 (the XOn
		 * macros above paste the second argument into an %%xmm register name).
		 * Passing NOP as pf yields the four operations with no leading prefetch.
		 * NOTE(review): OFFS() is defined outside this view; presumably each
		 * slot is 16 bytes, so one BLK64 covers a 64-byte line - confirm.
		 */
		#define BLK64(pf, op, i) \
		pf(i) \
		op(i, 0) \
		op(i + 1, 1) \
		op(i + 2, 2) \
		op(i + 3, 3)

		static void
		xor_sse_2(unsigned long bytes, unsigned long p1, unsigned long p2)
		@@ -110,6 +118,40 @@ xor_sse_2(unsigned long bytes, unsigned long p1, unsigned long p2)
		kernel_fpu_end();
		}

		/*
		 * xor_sse_2_pf64 - two-operand SSE loop built from BLK64 groups.
		 *
		 * @bytes: byte count; only (bytes >> 8) chunks of 256 bytes are processed.
		 * @p1:    address used as %[p1]; advanced by 256 after every iteration.
		 * @p2:    address used as %[p2]; advanced by 256 after every iteration.
		 *
		 * Each iteration expands BLOCK(0), BLOCK(4), BLOCK(8) and BLOCK(12); each
		 * BLOCK is a PF0+LD group, a PF1+XO1 group and an ST group without
		 * prefetch.  PF0, PF1, LD, XO1 and ST are defined outside this view -
		 * presumably: load from p1, xorps with p2, store back to p1 (confirm).
		 *
		 * The asm runs between kernel_fpu_begin() and kernel_fpu_end().
		 *
		 * NOTE(review): the counter is decremented and tested only after the
		 * loop body, so bytes < 256 (lines == 0) still executes the body and the
		 * counter wraps instead of terminating.  Callers presumably always pass
		 * a non-zero multiple of 256 - confirm.  Any remainder beyond a multiple
		 * of 256 is not processed here.
		 */
		static void
		xor_sse_2_pf64(unsigned long bytes, unsigned long p1, unsigned long p2)
		{
		/* Loop count: one iteration per 256 bytes (matches [inc] below). */
		unsigned long lines = bytes >> 8;

		kernel_fpu_begin();

		asm volatile(
		#undef BLOCK
		#define BLOCK(i) \
		BLK64(PF0, LD, i) \
		BLK64(PF1, XO1, i) \
		BLK64(NOP, ST, i) \

		" .align 32 ;\n"
		" 1: ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

		/* Advance both pointers by [inc] and loop until cnt reaches zero. */
		" add %[inc], %[p1] ;\n"
		" add %[inc], %[p2] ;\n"
		" dec %[cnt] ;\n"
		" jnz 1b ;\n"
		: [cnt] "+r" (lines),
		[p1] "+r" (p1), [p2] "+r" (p2)
		: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
		: "memory");

		kernel_fpu_end();
		}

		static void
		xor_sse_3(unsigned long bytes, unsigned long p1, unsigned long p2,
		unsigned long *p3)
		@@ -169,6 +211,43 @@ xor_sse_3(unsigned long bytes, unsigned long p1, unsigned long p2,
		kernel_fpu_end();
		}

		/*
		 * xor_sse_3_pf64 - three-operand SSE loop built from BLK64 groups.
		 *
		 * @bytes: byte count; only (bytes >> 8) chunks of 256 bytes are processed.
		 * @p1:    address used as %[p1]; advanced by 256 after every iteration.
		 * @p2:    address used as %[p2]; advanced by 256 after every iteration.
		 * @p3:    pointer used as %[p3]; the asm adds 256 to the register value,
		 *         i.e. it also advances by 256 bytes per iteration.
		 *
		 * Each iteration expands BLOCK(0), BLOCK(4), BLOCK(8) and BLOCK(12); each
		 * BLOCK is a PF0+LD group, PF1+XO1 and PF2+XO2 groups, and an ST group
		 * without prefetch.  XO2 is an xorps against (%[p3]); PF0..PF2, LD, XO1
		 * and ST are defined outside this view - presumably load from p1, xor
		 * with p2, store back to p1 (confirm).
		 *
		 * The asm runs between kernel_fpu_begin() and kernel_fpu_end().
		 *
		 * NOTE(review): the counter is decremented and tested only after the
		 * loop body, so bytes < 256 (lines == 0) still executes the body and the
		 * counter wraps instead of terminating.  Callers presumably always pass
		 * a non-zero multiple of 256 - confirm.
		 */
		static void
		xor_sse_3_pf64(unsigned long bytes, unsigned long p1, unsigned long p2,
		unsigned long *p3)
		{
		/* Loop count: one iteration per 256 bytes (matches [inc] below). */
		unsigned long lines = bytes >> 8;

		kernel_fpu_begin();

		asm volatile(
		#undef BLOCK
		#define BLOCK(i) \
		BLK64(PF0, LD, i) \
		BLK64(PF1, XO1, i) \
		BLK64(PF2, XO2, i) \
		BLK64(NOP, ST, i) \

		" .align 32 ;\n"
		" 1: ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

		/* Advance all three pointers by [inc] and loop until cnt reaches zero. */
		" add %[inc], %[p1] ;\n"
		" add %[inc], %[p2] ;\n"
		" add %[inc], %[p3] ;\n"
		" dec %[cnt] ;\n"
		" jnz 1b ;\n"
		: [cnt] "+r" (lines),
		[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
		: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
		: "memory");

		kernel_fpu_end();
		}

		static void
		xor_sse_4(unsigned long bytes, unsigned long p1, unsigned long p2,
		unsigned long p3, unsigned long p4)
		@@ -235,6 +314,45 @@ xor_sse_4(unsigned long bytes, unsigned long p1, unsigned long p2,
		kernel_fpu_end();
		}

		/*
		 * xor_sse_4_pf64 - four-operand SSE loop built from BLK64 groups.
		 *
		 * @bytes: byte count; only (bytes >> 8) chunks of 256 bytes are processed.
		 * @p1..@p4: addresses used as %[p1]..%[p4]; each is advanced by 256
		 *           after every iteration.
		 *
		 * Each iteration expands BLOCK(0), BLOCK(4), BLOCK(8) and BLOCK(12); each
		 * BLOCK is a PF0+LD group, PF1+XO1, PF2+XO2 and PF3+XO3 groups, and an ST
		 * group without prefetch.  XO2 and XO3 are xorps against (%[p3]) and
		 * (%[p4]); PF0..PF3, LD, XO1 and ST are defined outside this view -
		 * presumably load from p1, xor with p2, store back to p1 (confirm).
		 *
		 * The asm runs between kernel_fpu_begin() and kernel_fpu_end().
		 *
		 * NOTE(review): the counter is decremented and tested only after the
		 * loop body, so bytes < 256 (lines == 0) still executes the body and the
		 * counter wraps instead of terminating.  Callers presumably always pass
		 * a non-zero multiple of 256 - confirm.
		 */
		static void
		xor_sse_4_pf64(unsigned long bytes, unsigned long p1, unsigned long p2,
		unsigned long p3, unsigned long p4)
		{
		/* Loop count: one iteration per 256 bytes (matches [inc] below). */
		unsigned long lines = bytes >> 8;

		kernel_fpu_begin();

		asm volatile(
		#undef BLOCK
		#define BLOCK(i) \
		BLK64(PF0, LD, i) \
		BLK64(PF1, XO1, i) \
		BLK64(PF2, XO2, i) \
		BLK64(PF3, XO3, i) \
		BLK64(NOP, ST, i) \

		" .align 32 ;\n"
		" 1: ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

		/* Advance all four pointers by [inc] and loop until cnt reaches zero. */
		" add %[inc], %[p1] ;\n"
		" add %[inc], %[p2] ;\n"
		" add %[inc], %[p3] ;\n"
		" add %[inc], %[p4] ;\n"
		" dec %[cnt] ;\n"
		" jnz 1b ;\n"
		: [cnt] "+r" (lines), [p1] "+r" (p1),
		[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
		: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
		: "memory");

		kernel_fpu_end();
		}

		static void
		xor_sse_5(unsigned long bytes, unsigned long p1, unsigned long p2,
		unsigned long p3, unsigned long p4, unsigned long *p5)
		@@ -308,12 +426,63 @@ xor_sse_5(unsigned long bytes, unsigned long p1, unsigned long p2,
		kernel_fpu_end();
		}

		/*
		 * xor_sse_5_pf64 - five-operand SSE loop built from BLK64 groups.
		 *
		 * @bytes: byte count; only (bytes >> 8) chunks of 256 bytes are processed.
		 * @p1..@p4: addresses used as %[p1]..%[p4]; each is advanced by 256
		 *           after every iteration.
		 * @p5:    pointer used as %[p5]; the asm adds 256 to the register value,
		 *         i.e. it also advances by 256 bytes per iteration.
		 *
		 * Each iteration expands BLOCK(0), BLOCK(4), BLOCK(8) and BLOCK(12); each
		 * BLOCK is a PF0+LD group, PF1+XO1 through PF4+XO4 groups, and an ST
		 * group without prefetch.  XO2..XO4 are xorps against (%[p3])..(%[p5]);
		 * PF0..PF4, LD, XO1 and ST are defined outside this view - presumably
		 * load from p1, xor with p2, store back to p1 (confirm).
		 *
		 * The asm runs between kernel_fpu_begin() and kernel_fpu_end().
		 *
		 * NOTE(review): the counter is decremented and tested only after the
		 * loop body, so bytes < 256 (lines == 0) still executes the body and the
		 * counter wraps instead of terminating.  Callers presumably always pass
		 * a non-zero multiple of 256 - confirm.
		 */
		static void
		xor_sse_5_pf64(unsigned long bytes, unsigned long p1, unsigned long p2,
		unsigned long p3, unsigned long p4, unsigned long *p5)
		{
		/* Loop count: one iteration per 256 bytes (matches [inc] below). */
		unsigned long lines = bytes >> 8;

		kernel_fpu_begin();

		asm volatile(
		#undef BLOCK
		#define BLOCK(i) \
		BLK64(PF0, LD, i) \
		BLK64(PF1, XO1, i) \
		BLK64(PF2, XO2, i) \
		BLK64(PF3, XO3, i) \
		BLK64(PF4, XO4, i) \
		BLK64(NOP, ST, i) \

		" .align 32 ;\n"
		" 1: ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

		/* Advance all five pointers by [inc] and loop until cnt reaches zero. */
		" add %[inc], %[p1] ;\n"
		" add %[inc], %[p2] ;\n"
		" add %[inc], %[p3] ;\n"
		" add %[inc], %[p4] ;\n"
		" add %[inc], %[p5] ;\n"
		" dec %[cnt] ;\n"
		" jnz 1b ;\n"
		: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
		[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
		: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
		: "memory");

		kernel_fpu_end();
		}

		/*
		 * Template named "prefetch64-sse" that bundles the four *_pf64 routines
		 * defined in this header: do_2 .. do_5 map to the two- to five-operand
		 * variants respectively.
		 */
		static struct xor_block_template xor_block_sse_pf64 = {
		.name = "prefetch64-sse",
		.do_2 = xor_sse_2_pf64,
		.do_3 = xor_sse_3_pf64,
		.do_4 = xor_sse_4_pf64,
		.do_5 = xor_sse_5_pf64,
		};

		#undef LD
		#undef XO1
		#undef XO2
		#undef XO3
		#undef XO4
		#undef ST
		#undef NOP
		#undef BLK64
		#undef BLOCK

		#undef XOR_CONSTANT_CONSTRAINT
		@@ -324,4 +493,7 @@ xor_sse_5(unsigned long bytes, unsigned long p1, unsigned long p2,
		# include <asm/xor_64.h>
		#endif

		/*
		 * Forward the FASTEST argument unchanged to AVX_SELECT().
		 * NOTE(review): AVX_SELECT is defined outside this view; presumably it
		 * may substitute an AVX template when one is available - confirm.
		 */
		#define XOR_SELECT_TEMPLATE(FASTEST) \
		AVX_SELECT(FASTEST)

		#endif /* _ASM_X86_XOR_H */

arch/x86/include/asm/xor_32.h

+11 −12

Original line number	Diff line number	Diff line
		@@ -543,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
		/* Also try the generic routines. */
		#include <asm-generic/xor.h>

		/* We force the use of the SSE xor block because it can write around L2.
		We may also be able to load into the L1 only depending on how the cpu
		deals with a load to a line that is being prefetched. */
		#undef XOR_TRY_TEMPLATES
		#define XOR_TRY_TEMPLATES \
		do { \
		xor_speed(&xor_block_8regs); \
		xor_speed(&xor_block_8regs_p); \
		xor_speed(&xor_block_32regs); \
		xor_speed(&xor_block_32regs_p); \
		AVX_XOR_SPEED; \
		if (cpu_has_xmm) \
		if (cpu_has_xmm) { \
		xor_speed(&xor_block_pIII_sse); \
		if (cpu_has_mmx) { \
		xor_speed(&xor_block_sse_pf64); \
		} else if (cpu_has_mmx) { \
		xor_speed(&xor_block_pII_mmx); \
		xor_speed(&xor_block_p5_mmx); \
		} else { \
		xor_speed(&xor_block_8regs); \
		xor_speed(&xor_block_8regs_p); \
		xor_speed(&xor_block_32regs); \
		xor_speed(&xor_block_32regs_p); \
		} \
		} while (0)

		/* We force the use of the SSE xor block because it can write around L2.
		We may also be able to load into the L1 only depending on how the cpu
		deals with a load to a line that is being prefetched. */
		#define XOR_SELECT_TEMPLATE(FASTEST) \
		AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

		#endif /* _ASM_X86_XOR_32_H */

arch/x86/include/asm/xor_64.h

+4 −6

Original line number	Diff line number	Diff line
		@@ -13,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
		/* Also try the AVX routines */
		#include <asm/xor_avx.h>

		/* We force the use of the SSE xor block because it can write around L2.
		We may also be able to load into the L1 only depending on how the cpu
		deals with a load to a line that is being prefetched. */
		#undef XOR_TRY_TEMPLATES
		#define XOR_TRY_TEMPLATES \
		do { \
		AVX_XOR_SPEED; \
		xor_speed(&xor_block_sse_pf64); \
		xor_speed(&xor_block_sse); \
		} while (0)

		/* We force the use of the SSE xor block because it can write around L2.
		We may also be able to load into the L1 only depending on how the cpu
		deals with a load to a line that is being prefetched. */
		#define XOR_SELECT_TEMPLATE(FASTEST) \
		AVX_SELECT(&xor_block_sse)

		#endif /* _ASM_X86_XOR_64_H */