Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e827bb09 authored by Jussi Kivilinna, committed by Herbert Xu
Browse files

crypto: blowfish-x86_64 - improve x86_64 blowfish 4-way performance



This patch adds an improved F-macro for the 4-way parallel functions. With the
new F-macro for the 4-way parallel functions, blowfish sees a ~15% improvement
in speed tests on AMD Phenom II (~5% on Intel Xeon E7330).

However, when used in the 1-way blowfish function, the new macro would be ~10%
slower than the original, so the old F-macro is kept for the 1-way functions.
The patch cleans up the old F-macro, as it is no longer needed in the 4-way part.

The patch also renames register macros to reduce stack usage.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent fad8fa47
Loading
Loading
Loading
Loading
+98 −100
Original line number Diff line number Diff line
@@ -56,38 +56,32 @@

/*
 * Temporary registers (64-bit names). The F macros use these as indices
 * into the s0..s3 tables and, for RT0, as the accumulated F result.
 */
#define RT0 %rbp
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

/* 32-bit names of the same temporaries (targets of movzbl / movl). */
#define RT0d %ebp
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/*
 * Registers passed as the `k` argument of the two-argument F(x, k) macro.
 * Note that RK0/RK1 name the same registers as RT2/RT3 (%r8/%r9).
 */
#define RK0 %r8
#define RK1 %r9
#define RK2 %r10
#define RK3 %r11

/* 32-bit names of the RK registers (F(x, k) builds them via k ## d). */
#define RK0d %r8d
#define RK1d %r9d
#define RK2d %r10d
#define RK3d %r11d

/*
 * Register holding the preloaded round key in the 4-way code.
 * NOTE(review): RKEY is defined twice below with different registers
 * (%r12, then %r10, which is also RK2). Only one definition can be
 * intended - confirm which one the file is meant to keep.
 */
#define RKEY %r12
#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/
/*
 * F(x, k): apply the round function to the 64-bit register x, using k as
 * the result accumulator.
 *
 *  - rorq $16 moves bits 16..31 of x into the low 16 bits so that the
 *    x ## bh / x ## bl byte registers can pick them out; rolq $16 then
 *    restores x. (The pasted names, e.g. RX0bh / RX0bl, are defined outside
 *    this view - presumably the bits 8..15 and bits 0..7 byte registers.)
 *  - k ## d = ((s0[b3] + s1[b2]) ^ s2[b1]) + s3[b0], where b3..b0 are the
 *    bytes of the low 32 bits of x, most significant first.
 *  - rolq $32 swaps the 32-bit halves of x, and xorq folds the result into
 *    the new low half (the upper 32 bits of k are zero after the 32-bit
 *    writes, so the other half is left unchanged).
 *
 * Clobbers: RT0, RT1, k, flags.
 *
 * NOTE(review): a zero-argument `#define F()` immediately follows this
 * definition in the listing; both cannot be active at once - confirm which
 * one is meant to be kept.
 */
#define F(x, k) \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT1d; \
	rolq $16,		x; \
	movl s0(CTX,RT0,4),	k ## d; \
	addl s1(CTX,RT1,4),	k ## d; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT1d; \
	rolq $32,		x; \
	xorl s2(CTX,RT0,4),	k ## d; \
	addl s3(CTX,RT1,4),	k ## d; \
	xorq k,			x;
/*
 * F(): round function for the 1-way code, hard-wired to operate on RX0.
 *
 *  - rorq $16 / rolq $16 temporarily bring bits 16..31 of RX0 into the low
 *    16 bits so RX0bh / RX0bl can select them (RX0bh / RX0bl are defined
 *    outside this view - presumably the bits 8..15 and bits 0..7 byte
 *    registers of RX0).
 *  - RT0d = ((s0[b3] + s1[b2]) ^ s2[b1]) + s3[b0], where b3..b0 are the
 *    bytes of the low 32 bits of RX0, most significant first. RT0 is both
 *    the s0 index and the accumulator; RT1 and RT2 hold the other indices.
 *  - rolq $32 swaps the 32-bit halves of RX0, and xorq folds the result
 *    into the new low half (the upper 32 bits of RT0 are zero after the
 *    32-bit writes).
 *
 * Clobbers: RT0, RT1, RT2, flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/*
 * add_roundkey_enc(n): XOR the 64-bit value at p+4*(n)(CTX) into RX0.
 * The offset advances 4 bytes per n while xorq reads 8 bytes, so one use
 * covers the entries at n and n+1 (assuming p holds 4-byte entries -
 * confirm against the ctx layout, which is outside this view).
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX), 	RX0;
@@ -95,11 +89,8 @@
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(RX0, RK0); \
	F(RX0, RK0);

#define round_final_enc(n) \
	xorq p+4*(n)(CTX), 	RX0;
	F(); \
	F();

#define add_roundkey_dec(n) \
	movq p+4*(n-1)(CTX),	RT0; \
@@ -109,8 +100,8 @@
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(RX0, RK0); \
	F(RX0, RK0); \
	F(); \
	F(); \

#define read_block() \
	movq (RIO), 		RX0; \
@@ -130,16 +121,15 @@
.type   __blowfish_enc_blk,@function;

__blowfish_enc_blk:
	// input:
	//	%rdi: ctx, CTX
	//	%rsi: dst
	//	%rdx: src
	//	%rcx: bool xor
	pushq %rbp;
	pushq %rbx;
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	movq %rbp, %r11;

	pushq %rsi;
	pushq %rcx;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();
@@ -154,38 +144,31 @@ __blowfish_enc_blk:
	round_enc(14);
	add_roundkey_enc(16);

	popq %rbp;
	popq RIO;
	movq %r11, %rbp;

	test %bpl, %bpl;
	movq %r10, RIO;
	test %cl, %cl;
	jnz __enc_xor;

	write_block();

__enc_ret:
	popq %rbx;
	popq %rbp;

	ret;

__enc_xor:
	xor_block();

	jmp __enc_ret;
	ret;

.align 8
.global blowfish_dec_blk
.type   blowfish_dec_blk,@function;

blowfish_dec_blk:
	// input:
	//	%rdi: ctx, CTX
	//	%rsi: dst
	//	%rdx: src
	pushq %rbp;
	pushq %rbx;
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %rbp, %r11;

	pushq %rsi;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();
@@ -200,17 +183,33 @@ blowfish_dec_blk:
	round_dec(3);
	add_roundkey_dec(1);

	popq RIO;
	movq %r10, RIO;
	write_block();

	popq %rbx;
	popq %rbp;
	movq %r11, %rbp;

	ret;

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * F4(x): apply the round function to the 64-bit register x.
 *
 *  - All four table indices are extracted first: RT1/RT3 take x ## bh and
 *    x ## bl before the first rorq $16, RT0/RT2 take them after it (i.e.
 *    from bits 16..31 of the original x). The x ## bh / x ## bl names are
 *    defined outside this view - presumably the bits 8..15 and bits 0..7
 *    byte registers.
 *  - The two rorq $16 together rotate x by 32, which swaps its halves.
 *  - RT0d = ((s0[RT0] + s1[RT2]) ^ s2[RT1]) + s3[RT3]; xorq then folds it
 *    into the new low half of x (the upper 32 bits of RT0 are zero after
 *    the 32-bit writes).
 *
 * Clobbers: RT0, RT1, RT2, RT3, flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
@@ -227,15 +226,15 @@ blowfish_dec_blk:
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3); \
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3);
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
@@ -248,15 +247,15 @@ blowfish_dec_blk:
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3); \
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3);
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

#define read_block4() \
	movq (RIO),		RX0; \
@@ -306,18 +305,19 @@ blowfish_dec_blk:
.type   __blowfish_enc_blk_4way,@function;

__blowfish_enc_blk_4way:
	// input:
	//	%rdi: ctx, CTX
	//	%rsi: dst
	//	%rdx: src
	//	%rcx: bool xor
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	pushq %rbp;
	pushq %rbx;
	pushq RKEY;
	pushq %rcx;

	preload_roundkey_enc(0);

	pushq %rsi;
	pushq %rcx;
	movq %rsi, %r11;
	movq %rdx, RIO;

	read_block4();
@@ -333,40 +333,39 @@ __blowfish_enc_blk_4way:
	add_preloaded_roundkey4();

	popq %rbp;
	popq RIO;
	movq %r11, RIO;

	test %bpl, %bpl;
	jnz __enc_xor4;

	write_block4();

__enc_ret4:
	popq RKEY;
	popq %rbx;
	popq %rbp;

	ret;

__enc_xor4:
	xor_block4();

	jmp __enc_ret4;
	popq %rbx;
	popq %rbp;
	ret;

.align 8
.global blowfish_dec_blk_4way
.type   blowfish_dec_blk_4way,@function;

blowfish_dec_blk_4way:
	// input:
	//	%rdi: ctx, CTX
	//	%rsi: dst
	//	%rdx: src
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %rbp;
	pushq %rbx;
	pushq RKEY;
	preload_roundkey_dec(17);

	pushq %rsi;
	movq %rsi, %r11;
	movq %rdx, RIO;

	read_block4();
@@ -381,10 +380,9 @@ blowfish_dec_blk_4way:
	round_dec4(3);
	add_preloaded_roundkey4();

	popq RIO;
	movq %r11, RIO;
	write_block4();

	popq RKEY;
	popq %rbx;
	popq %rbp;