Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b8716614 authored by Linus Torvalds
Browse files
Pull crypto update from Herbert Xu:
 "* sha512 bug fixes (already in your tree).
  * SHA224/SHA384 AEAD support in caam.
  * X86-64 optimised version of Camellia.
  * Tegra AES support.
  * Bulk algorithm registration interface to make driver registration easier.
  * padata race fixes.
  * Misc fixes."

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (31 commits)
  padata: Fix race on sequence number wrap
  padata: Fix race in the serialization path
  crypto: camellia - add assembler implementation for x86_64
  crypto: camellia - rename camellia.c to camellia_generic.c
  crypto: camellia - fix checkpatch warnings
  crypto: camellia - rename camellia module to camellia_generic
  crypto: tcrypt - add more camellia tests
  crypto: testmgr - add more camellia test vectors
  crypto: camellia - simplify key setup and CAMELLIA_ROUNDSM macro
  crypto: twofish-x86_64/i586 - set alignmask to zero
  crypto: blowfish-x86_64 - set alignmask to zero
  crypto: serpent-sse2 - combine ablk_*_init functions
  crypto: blowfish-x86_64 - use crypto_[un]register_algs
  crypto: twofish-x86_64-3way - use crypto_[un]register_algs
  crypto: serpent-sse2 - use crypto_[un]register_algs
  crypto: serpent-sse2 - remove dead code from serpent_sse2_glue.c::serpent_sse2_init()
  crypto: twofish-x86 - Remove dead code from twofish_glue_3way.c::init()
  crypto: In crypto_add_alg(), 'exact' wants to be initialized to 0
  crypto: caam - fix gcc 4.6 warning
  crypto: Add bulk algorithm registration interface
  ...
parents 31f67652 2dc9b5db
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@

#include <linux/kernel.h>
#include <linux/io.h>
#include <linux/module.h>

#include <mach/iomap.h>

@@ -58,6 +59,7 @@ unsigned long long tegra_chip_uid(void)
	hi = fuse_readl(FUSE_UID_HIGH);
	return (hi << 32ull) | lo;
}
EXPORT_SYMBOL(tegra_chip_uid);

int tegra_sku_id(void)
{
+2 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o

obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
@@ -25,6 +26,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o

aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
+94 −97
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@
 *
 */

#include <asm/processor.h>
#include <crypto/blowfish.h>
#include <linux/crypto.h>
#include <linux/init.h>
@@ -76,27 +77,6 @@ static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
	blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
}

/*
 * Descriptor for the bare single-block "blowfish" cipher ("blowfish-asm").
 *
 * Priority 200 sits below the 300 used by this driver's ecb/cbc/ctr
 * descriptors.
 *
 * No alignment mask is requested, matching every other descriptor of this
 * driver (they all use .cra_alignmask = 0); the previous value of 3 made
 * the crypto layer enforce 4-byte alignment for this one entry only.
 */
static struct crypto_alg bf_alg = {
	.cra_name		=	"blowfish",
	.cra_driver_name	=	"blowfish-asm",
	.cra_priority		=	200,
	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
	.cra_blocksize		=	BF_BLOCK_SIZE,
	.cra_ctxsize		=	sizeof(struct bf_ctx),
	.cra_alignmask		=	0,
	.cra_module		=	THIS_MODULE,
	.cra_list		=	LIST_HEAD_INIT(bf_alg.cra_list),
	.cra_u			=	{
		.cipher = {
			.cia_min_keysize	=	BF_MIN_KEY_SIZE,
			.cia_max_keysize	=	BF_MAX_KEY_SIZE,
			.cia_setkey		=	blowfish_setkey,
			.cia_encrypt		=	blowfish_encrypt,
			.cia_decrypt		=	blowfish_decrypt,
		}
	}
};

static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
		     void (*fn)(struct bf_ctx *, u8 *, const u8 *),
		     void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *))
@@ -160,28 +140,6 @@ static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
	return ecb_crypt(desc, &walk, blowfish_dec_blk, blowfish_dec_blk_4way);
}

/* Descriptor for "ecb(blowfish)", exposed as driver "ecb-blowfish-asm". */
static struct crypto_alg blk_ecb_alg = {
	/* Identity, selection priority and ownership. */
	.cra_name	 = "ecb(blowfish)",
	.cra_driver_name = "ecb-blowfish-asm",
	.cra_priority	 = 300,
	.cra_module	 = THIS_MODULE,
	.cra_list	 = LIST_HEAD_INIT(blk_ecb_alg.cra_list),

	/* Algorithm class and data layout. */
	.cra_type	 = &crypto_blkcipher_type,
	.cra_flags	 = CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize	 = BF_BLOCK_SIZE,
	.cra_ctxsize	 = sizeof(struct bf_ctx),
	.cra_alignmask	 = 0,

	/* Key limits and entry points; ECB carries no IV. */
	.cra_u.blkcipher = {
		.setkey		= blowfish_setkey,
		.min_keysize	= BF_MIN_KEY_SIZE,
		.max_keysize	= BF_MAX_KEY_SIZE,
		.encrypt	= ecb_encrypt,
		.decrypt	= ecb_decrypt,
	},
};

static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
				  struct blkcipher_walk *walk)
{
@@ -307,29 +265,6 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
	return err;
}

/* Descriptor for "cbc(blowfish)", exposed as driver "cbc-blowfish-asm". */
static struct crypto_alg blk_cbc_alg = {
	/* Identity, selection priority and ownership. */
	.cra_name	 = "cbc(blowfish)",
	.cra_driver_name = "cbc-blowfish-asm",
	.cra_priority	 = 300,
	.cra_module	 = THIS_MODULE,
	.cra_list	 = LIST_HEAD_INIT(blk_cbc_alg.cra_list),

	/* Algorithm class and data layout. */
	.cra_type	 = &crypto_blkcipher_type,
	.cra_flags	 = CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize	 = BF_BLOCK_SIZE,
	.cra_ctxsize	 = sizeof(struct bf_ctx),
	.cra_alignmask	 = 0,

	/* Key limits, IV size (one cipher block) and entry points. */
	.cra_u.blkcipher = {
		.setkey		= blowfish_setkey,
		.min_keysize	= BF_MIN_KEY_SIZE,
		.max_keysize	= BF_MAX_KEY_SIZE,
		.ivsize		= BF_BLOCK_SIZE,
		.encrypt	= cbc_encrypt,
		.decrypt	= cbc_decrypt,
	},
};

static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk)
{
	u8 *ctrblk = walk->iv;
@@ -423,7 +358,67 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
	return err;
}

static struct crypto_alg blk_ctr_alg = {
static struct crypto_alg bf_algs[4] = { {
	.cra_name		= "blowfish",
	.cra_driver_name	= "blowfish-asm",
	.cra_priority		= 200,
	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
	.cra_blocksize		= BF_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct bf_ctx),
	.cra_alignmask		= 0,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(bf_algs[0].cra_list),
	.cra_u = {
		.cipher = {
			.cia_min_keysize	= BF_MIN_KEY_SIZE,
			.cia_max_keysize	= BF_MAX_KEY_SIZE,
			.cia_setkey		= blowfish_setkey,
			.cia_encrypt		= blowfish_encrypt,
			.cia_decrypt		= blowfish_decrypt,
		}
	}
}, {
	.cra_name		= "ecb(blowfish)",
	.cra_driver_name	= "ecb-blowfish-asm",
	.cra_priority		= 300,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= BF_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct bf_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(bf_algs[1].cra_list),
	.cra_u = {
		.blkcipher = {
			.min_keysize	= BF_MIN_KEY_SIZE,
			.max_keysize	= BF_MAX_KEY_SIZE,
			.setkey		= blowfish_setkey,
			.encrypt	= ecb_encrypt,
			.decrypt	= ecb_decrypt,
		},
	},
}, {
	.cra_name		= "cbc(blowfish)",
	.cra_driver_name	= "cbc-blowfish-asm",
	.cra_priority		= 300,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= BF_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct bf_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(bf_algs[2].cra_list),
	.cra_u = {
		.blkcipher = {
			.min_keysize	= BF_MIN_KEY_SIZE,
			.max_keysize	= BF_MAX_KEY_SIZE,
			.ivsize		= BF_BLOCK_SIZE,
			.setkey		= blowfish_setkey,
			.encrypt	= cbc_encrypt,
			.decrypt	= cbc_decrypt,
		},
	},
}, {
	.cra_name		= "ctr(blowfish)",
	.cra_driver_name	= "ctr-blowfish-asm",
	.cra_priority		= 300,
@@ -433,7 +428,7 @@ static struct crypto_alg blk_ctr_alg = {
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
	.cra_list		= LIST_HEAD_INIT(bf_algs[3].cra_list),
	.cra_u = {
		.blkcipher = {
			.min_keysize	= BF_MIN_KEY_SIZE,
@@ -444,43 +439,45 @@ static struct crypto_alg blk_ctr_alg = {
			.decrypt	= ctr_crypt,
		},
	},
};
} };

/*
 * Report whether the boot CPU is one on which this module should not load.
 *
 * Only Intel family 0x0f (Pentium 4) qualifies: there blowfish-x86_64 is
 * slower than the generic C implementation because it relies on 64bit
 * rotates, which are really slow on P4.
 */
static bool is_blacklisted_cpu(void)
{
	return boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
	       boot_cpu_data.x86 == 0x0f;
}

/*
 * "force" module parameter: when non-zero, init() skips the CPU blacklist
 * check and registers the algorithms regardless of the boot CPU.
 */
static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");

/*
 * Module entry point.
 *
 * Refuses to load with -ENODEV on a blacklisted CPU unless the "force"
 * parameter is set; otherwise registers every descriptor in bf_algs[] with
 * a single bulk call and returns its result.
 *
 * The earlier one-by-one crypto_register_alg() sequence returned before the
 * bulk call could run, leaving bf_algs[] unregistered while fini() still
 * tried to unregister it.
 * NOTE(review): the bulk helper is expected to undo partial registration on
 * failure, which is why no manual unwind is kept here - confirm against the
 * crypto API documentation.
 */
static int __init init(void)
{
	if (!force && is_blacklisted_cpu()) {
		printk(KERN_INFO
			"blowfish-x86_64: performance on this CPU "
			"would be suboptimal: disabling "
			"blowfish-x86_64.\n");
		return -ENODEV;
	}

	return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
}

/*
 * Module exit point: unregisters exactly the set that init() registered,
 * i.e. every descriptor in bf_algs[].
 */
static void __exit fini(void)
{
	crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
}

module_init(init);
+520 −0
Original line number Diff line number Diff line
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

.file "camellia-x86_64-asm_64.S"
.text

.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/*
 * Register macros.
 *
 * Suffix convention: "d" is the 32-bit view of a register, "bl" its lowest
 * byte and "bh" its second-lowest byte (%ah/%bh/%ch/%dh).
 *
 * Aliasing to keep in mind when reading the code below:
 *  - RIO and RT0 are both %rsi, so the I/O pointer is lost as soon as a
 *    macro uses RT0 as scratch; the functions reload RIO from RDST before
 *    writing their output.
 *  - RT1 is %rbp; the functions park the caller's %rbp in RRBP and restore
 *    it before returning.
 */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

#define RT0 %rsi
#define RT1 %rbp
#define RT2 %r8

#define RT0d %esi
#define RT1d %ebp
#define RT2d %r8d

#define RT2bl %r8b

#define RXOR %r9
#define RRBP %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16: use the low 16 bits of 'ab' as two table indexes.
 *
 * tmp2 receives the lowest byte of 'ab' and indexes table T0; tmp1 receives
 * the second-lowest byte and indexes table T1. Both tables are addressed
 * with a scale of 8 (one quadword per entry) and the two quadwords are XORed
 * into 'dst'. 'ab' is rotated right by 16 bits so the next invocation sees
 * the following two bytes; after four invocations 'ab' holds its original
 * value again.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq T0(, tmp2, 8),		dst; \
	xorq T1(, tmp1, 8),		dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/
/*
 * roundsm: one round on block 0.
 *
 * Loads the quadword subkey number 'subkey' (byte offset subkey * 8 in the
 * key table) into RT2, then walks all eight bytes of ab0 with four
 * xor2ror16 steps, XORing the table lookups alternately into cd0 and RT2.
 * The closing XOR folds RT2 (subkey plus half of the lookups) into cd0.
 *
 * Clobbers RT0, RT1 and RT2. ab0 is left unchanged (four 16-bit rotations).
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;

/*
 * fls: keyed mixing layer applied to block 0 between groups of rounds
 * (NOTE(review): presumably Camellia's FL / FL^-1 functions - confirm
 * against the algorithm specification).
 *
 * With kl/kr being quadword subkey indexes, the four steps are:
 *  1. l0.high ^= rol32(kl.low & l0.low, 1)
 *  2. r0.low  ^= (kr | r0).high
 *  3. l0.low  ^= (kl | l0).high
 *  4. r0.high ^= rol32(kr.low & r0.low, 1)
 * where .low/.high are the lower/upper 32 bits of the 64-bit value.
 *
 * Clobbers RT0, RT1 and RT2.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;

/*
 * enc_rounds: six rounds for encryption, consuming subkeys i + 2 .. i + 7
 * in ascending order. The roles of the two halves swap every round: RAB
 * drives the update of RCD, then RCD drives the update of RAB.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* enc_fls: fls layer for encryption, using subkey i for RAB and i + 1 for RCD. */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack: load one 16-byte block from RIO into RAB0 (bytes 0-7) and
 * RCD0 (bytes 8-15). Each quadword is byte-swapped and then has its 32-bit
 * halves exchanged (a 64-bit rotate by 32). Finally the first subkey
 * (key_table[0]) is XORed into RAB0.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;

/*
 * enc_outunpack: write the encrypted block to RIO.
 *
 * 'max' is a register holding the quadword index of the last subkey; that
 * subkey is XORed into RCD0 first. The halves are written swapped relative
 * to the input layout: RCD0 goes to bytes 0-7, RAB0 to bytes 8-15, each
 * after undoing the rotate/byte-swap done by enc_inpack.
 *
 * 'op' is pasted with "q" to form the store instruction: "mov" overwrites
 * the destination, "xor" XORs the result into what is already there.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO);

/*
 * dec_rounds: six rounds for decryption, consuming subkeys i + 7 .. i + 2
 * in descending order (the reverse of enc_rounds).
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/*
 * dec_fls: fls layer for decryption; the subkeys are swapped relative to
 * enc_fls (i + 1 for RAB, i for RCD).
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack: same load/byte-swap/rotate sequence as enc_inpack, but the
 * subkey XORed into RAB0 is the last one, key_table[max], where 'max' is a
 * register holding its quadword index.
 */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;

/*
 * dec_outunpack: write the decrypted block to RIO. The first subkey
 * (key_table[0]) is XORed into RCD0, then RCD0 is stored to bytes 0-7 and
 * RAB0 to bytes 8-15 after undoing the rotate/byte-swap of dec_inpack.
 * Unlike enc_outunpack this always overwrites the destination.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO);

.global __camellia_enc_blk;
.type   __camellia_enc_blk,@function;

/*
 * __camellia_enc_blk: encrypt one 16-byte block from src and either store
 * it to dst or XOR it into dst, depending on the low byte of the fourth
 * argument.
 *
 * Three groups of six rounds are always executed; a fourth group runs
 * unless the byte at key_length(CTX) equals 16. The index of the final
 * subkey ("max") is 24 or 32 accordingly.
 */
__camellia_enc_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	/* %rbp doubles as scratch register RT1: park the caller's value */
	movq %rbp, RRBP;

	/* %rcx and %rsi are reused as RCD0 and RIO/RT0: move the arguments away */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/* key_length == 16: the block is finished after 18 rounds */
	cmpb $16, key_length(CTX);
	je __enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

__enc_done:
	testb RXORbl, RXORbl;
	/* the rounds clobbered RIO (== RT0); movq leaves the flags of testb intact */
	movq RDST, RIO;

	jnz __enc_xor;

	/* xor flag clear: overwrite dst */
	enc_outunpack(mov, RT1);

	movq RRBP, %rbp;
	ret;

__enc_xor:
	/* xor flag set: XOR the result into the existing contents of dst */
	enc_outunpack(xor, RT1);

	movq RRBP, %rbp;
	ret;

.global camellia_dec_blk;
.type   camellia_dec_blk,@function;

/*
 * camellia_dec_blk: decrypt one 16-byte block from src and store it to dst.
 *
 * The subkeys are consumed in the reverse order of __camellia_enc_blk: the
 * first group of rounds (subkeys 24..31) is skipped when key_length is 16.
 *
 * NOTE(review): key_length is compared as a 32-bit value here (cmpl) but as
 * a single byte in __camellia_enc_blk (cmpb) - confirm the field width in
 * struct camellia_ctx.
 */
camellia_dec_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/*
	 * max = (key_length == 16) ? 24 : 32. The two movl between cmpl and
	 * cmovel do not touch the flags. RXORd is free to use as a temporary
	 * because this function takes no xor argument.
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* %rbp doubles as RT1 and %rsi as RIO/RT0: save %rbp and the dst pointer */
	movq %rbp, RRBP;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* must be tested before the first round: roundsm overwrites RT2 */
	cmpb $24, RT2bl;
	je __dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	/* the rounds clobbered RIO (== RT0): point it at dst for the store */
	movq RDST, RIO;

	dec_outunpack();

	movq RRBP, %rbp;
	ret;

/**********************************************************************
  2-way camellia
 **********************************************************************/
/*
 * roundsm2: one round applied to two blocks (register sets 0 and 1) with
 * the same subkey.
 *
 * Block 0 is processed as in roundsm: lookups alternate between cd0 and
 * RT2, and RT2 (subkey plus half of the lookups) is folded into cd0 at the
 * end. Block 1 gets the subkey XORed into cd1 up front, and all of its
 * lookups go straight into cd1. The fold of RT2 into cd0 is placed after
 * the first block-1 step, so the two blocks' instructions interleave.
 *
 * Clobbers RT0, RT1 and RT2; ab0 and ab1 are left unchanged.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2: the fls layer applied to two blocks. The same four steps as in fls
 * are performed on register set 0 and on register set 1 (the doubly
 * indented lines), with the first two steps of both blocks done before the
 * last two. The scratch register used changes from step to step among RT0,
 * RT1 and RT2, all of which are clobbered.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
		andl l ## 1d,					RT2d; \
		roll $1,					RT2d; \
		shlq $32,					RT2; \
		xorq RT2,					l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
		orq r ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
		orq l ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
		andl r ## 1d,					RT1d; \
		roll $1,					RT1d; \
		shlq $32,					RT1; \
		xorq RT1,					r ## 1;

/*
 * enc_rounds2: two-block counterpart of enc_rounds - six roundsm2 rounds
 * with subkeys i + 2 .. i + 7 in ascending order.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* enc_fls2: two-block fls layer for encryption (subkeys i and i + 1). */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack2: load two consecutive 16-byte blocks from RIO. Block 0
 * (bytes 0-15) goes to RAB0/RCD0 and block 1 (bytes 16-31) to RAB1/RCD1.
 * Each quadword is byte-swapped and rotated by 32 bits, and the first
 * subkey (key_table[0]) is XORed into both RAB0 and RAB1.
 *
 * The ror/rol mnemonics are swapped compared with enc_inpack; for a 64-bit
 * rotate by 32 both directions produce the same value.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX),		RAB1;

/*
 * enc_outunpack2: write two encrypted blocks to RIO (bytes 0-15 and 16-31).
 * For each block the last subkey, key_table[max], is XORed into RCD, then
 * RCD is written to the first quadword and RAB to the second after undoing
 * the rotate/byte-swap of enc_inpack2.
 *
 * 'op' is pasted with "q": "mov" overwrites the destination, "xor" XORs the
 * result into it. 'max' is a register holding the subkey's quadword index.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX, max, 8),	RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		op ## q RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		op ## q RAB1,			12*2(RIO);

/*
 * dec_rounds2: two-block counterpart of dec_rounds - six roundsm2 rounds
 * with subkeys i + 7 .. i + 2 in descending order.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/*
 * dec_fls2: two-block fls layer for decryption; subkeys swapped relative to
 * enc_fls2 (i + 1 for RAB, i for RCD).
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack2: same two-block load as enc_inpack2, but the subkey XORed
 * into RAB0 and RAB1 is the last one, key_table[max] ('max' is a register
 * holding its quadword index).
 */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX, max, 8),	RAB1;

/*
 * dec_outunpack2: write two decrypted blocks to RIO (bytes 0-15 and 16-31).
 * For each block the first subkey (key_table[0]) is XORed into RCD, then
 * RCD is stored to the first quadword and RAB to the second after undoing
 * the rotate/byte-swap of dec_inpack2. The destination is always
 * overwritten.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX),		RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		movq RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		movq RAB1,			12*2(RIO);

.global __camellia_enc_blk_2way;
.type   __camellia_enc_blk_2way,@function;

/*
 * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks from src
 * and either store them to dst or XOR them into dst, depending on the low
 * byte of the fourth argument.
 *
 * Same structure as __camellia_enc_blk, with two differences: %rbx (RAB1)
 * is preserved on the stack, and "max" is kept in RT2 instead of RT1.
 */
__camellia_enc_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	/* %rbx is used as RAB1 for the second block */
	pushq %rbx;

	/* %rbp doubles as RT1; %rcx, %rsi, %rdx become RCD0, RIO/RT0, RCD1 */
	movq %rbp, RRBP;
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/* key_length == 16: the blocks are finished after 18 rounds */
	cmpb $16, key_length(CTX);
	je __enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

__enc2_done:
	test RXORbl, RXORbl;
	/* the rounds clobbered RIO (== RT0); movq leaves the flags of test intact */
	movq RDST, RIO;
	jnz __enc2_xor;

	/* xor flag clear: overwrite dst */
	enc_outunpack2(mov, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

__enc2_xor:
	/* xor flag set: XOR the result into the existing contents of dst */
	enc_outunpack2(xor, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.global camellia_dec_blk_2way;
.type   camellia_dec_blk_2way,@function;

/*
 * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks from src
 * and store them to dst.
 *
 * Same structure as camellia_dec_blk. As there is no xor argument, RXOR is
 * free: it first serves as a temporary for the constant 24 and then holds
 * the caller's %rbx (used as RAB1) for the rest of the function.
 */
camellia_dec_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; movl does not touch the flags */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* save %rbx (RAB1) and %rbp (RT1) in registers, and dst in RDST */
	movq %rbx, RXOR;
	movq %rbp, RRBP;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* must be tested before the first round: roundsm2 overwrites RT2 */
	cmpb $24, RT2bl;
	je __dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	/* the rounds clobbered RIO (== RT0): point it at dst for the store */
	movq RDST, RIO;

	dec_outunpack2();

	movq RRBP, %rbp;
	movq RXOR, %rbx;
	ret;
+1952 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading