Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b4f656ee authored by Russell King's avatar Russell King
Browse files

Pull branch 'for-rmk' of git://git.linaro.org/people/ardbiesheuvel/linux-arm into devel-stable

Comments from Ard Biesheuvel:

I have included two use cases that I have been using, XOR and RAID-6
checksumming. The former gets a 60% performance boost on the NEON, the
latter over 400%.

ARM: add support for kernel mode NEON

Adds kernel_neon_begin/end (renamed from kernel_vfp_begin/end in the
previous version to de-emphasize the VFP part as VFP code that needs
software assistance is not supported currently.)

Introduces <asm/neon.h> and the Kconfig symbol KERNEL_MODE_NEON. This
has been aligned with Catalin for arm64, so any NEON code that does
not use assembly but intrinsics or the GCC vectorizer (such as my
examples) can potentially be shared between arm and arm64 archs.

ARM: move VFP init to an earlier boot stage

This is needed so the NEON is enabled when the XOR and RAID-6 algo
boot time benchmarks are run.

ARM: be strict about FP exceptions in kernel mode

This adds a check to vfp_support_entry() to flag unsupported uses of
the NEON/VFP in kernel mode. FP exceptions (bounces) are flagged as
a bug, this is because of their potentially intermittent nature.
Exceptions caused by the fact that kernel_neon_begin has not been
called are just routed through the undef handler.

ARM: crypto: add NEON accelerated XOR implementation

This is the xor_blocks() implementation built with -ftree-vectorize,
60% faster than optimized ARM code. It calls in_interrupt() to check
whether the NEON flavor can be used: this should really not be
necessary, but due to xor_blocks's quite generic nature, there is no
telling how exactly people may be using it in the real world.

lib/raid6: add ARM-NEON accelerated syndrome calculation

This is a port of the RAID-6 checksumming code in altivec.uc to NEON
intrinsics. It is about 4x faster than the sequential code.
parents 3b2f64d0 7d11965d
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -2176,6 +2176,13 @@ config NEON
	  Say Y to include support code for NEON, the ARMv7 Advanced SIMD
	  Extension.

# Opt-in switch for using NEON from kernel code. It requires NEON support
# itself and is off unless explicitly enabled.
config KERNEL_MODE_NEON
	bool "Support for NEON in kernel mode"
	default n
	depends on NEON
	help
	  Say Y to include support for NEON in kernel mode.

endmenu

menu "Userspace binary formats"
+36 −0
Original line number Diff line number Diff line
/*
 * linux/arch/arm/include/asm/neon.h
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <asm/hwcap.h>

/* Evaluates to 1 when the HWCAP_NEON bit is set in elf_hwcap, 0 otherwise. */
#define cpu_has_neon()		((elf_hwcap & HWCAP_NEON) != 0)

#ifdef __ARM_NEON__

/*
 * If you are affected by the BUILD_BUG below, it probably means that you are
 * using NEON code /and/ calling the kernel_neon_begin() function from the same
 * compilation unit. To prevent issues that may arise from GCC reordering or
 * generating(1) NEON instructions outside of these begin/end functions, the
 * only supported way of using NEON code in the kernel is by isolating it in a
 * separate compilation unit, and calling it from another unit from inside a
 * kernel_neon_begin/kernel_neon_end pair.
 *
 * (1) Current GCC (4.7) might generate NEON instructions at O3 level if
 *     -mfpu=neon is set.
 */

/*
 * In a unit compiled with NEON enabled, any use of kernel_neon_begin()
 * expands to an unconditional build failure carrying the message below.
 */
#define kernel_neon_begin() \
	BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")

#else
/*
 * Opens a region in which NEON code (built in a separate compilation unit)
 * may be called; must be paired with kernel_neon_end().
 */
void kernel_neon_begin(void);
#endif
/*
 * Closes the region opened by kernel_neon_begin(). Declared unconditionally,
 * so the prototype is visible in NEON-built units as well.
 */
void kernel_neon_end(void);
+73 −0
Original line number Diff line number Diff line
@@ -7,7 +7,10 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/hardirq.h>
#include <asm-generic/xor.h>
#include <asm/hwcap.h>
#include <asm/neon.h>

/*
 * XOR a2 into a1 in place. Arguments and the whole expansion are
 * parenthesized so the macro behaves as a single expression regardless of
 * the operators in its arguments or in the surrounding context.
 */
#define __XOR(a1, a2) ((a1) ^= (a2))

@@ -138,4 +141,74 @@ static struct xor_block_template xor_block_arm4regs = {
		xor_speed(&xor_block_arm4regs);	\
		xor_speed(&xor_block_8regs);	\
		xor_speed(&xor_block_32regs);	\
		NEON_TEMPLATES;			\
	} while (0)

#ifdef CONFIG_KERNEL_MODE_NEON

/*
 * Template holding the NEON-built XOR routines. It is defined in
 * arch/arm/lib/xor-neon.c, which is compiled with the NEON flags, so its
 * handlers may only be called between kernel_neon_begin() and
 * kernel_neon_end().
 */
extern struct xor_block_template const xor_block_neon_inner;

/*
 * do_2 handler of the "neon" template.
 *
 * When called in interrupt context the NEON unit is not used and the work is
 * handed to xor_arm4regs_2() instead. Otherwise the NEON-built inner routine
 * runs inside a kernel_neon_begin()/kernel_neon_end() pair.
 */
static void
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	/* Interrupt context: fall back to the plain ARM implementation. */
	if (in_interrupt()) {
		xor_arm4regs_2(bytes, p1, p2);
		return;
	}

	kernel_neon_begin();
	xor_block_neon_inner.do_2(bytes, p1, p2);
	kernel_neon_end();
}

/*
 * do_3 handler of the "neon" template.
 *
 * When called in interrupt context the NEON unit is not used and the work is
 * handed to xor_arm4regs_3() instead. Otherwise the NEON-built inner routine
 * runs inside a kernel_neon_begin()/kernel_neon_end() pair.
 */
static void
xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
		unsigned long *p3)
{
	/* Interrupt context: fall back to the plain ARM implementation. */
	if (in_interrupt()) {
		xor_arm4regs_3(bytes, p1, p2, p3);
		return;
	}

	kernel_neon_begin();
	xor_block_neon_inner.do_3(bytes, p1, p2, p3);
	kernel_neon_end();
}

/*
 * do_4 handler of the "neon" template.
 *
 * When called in interrupt context the NEON unit is not used and the work is
 * handed to xor_arm4regs_4() instead. Otherwise the NEON-built inner routine
 * runs inside a kernel_neon_begin()/kernel_neon_end() pair.
 */
static void
xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
		unsigned long *p3, unsigned long *p4)
{
	/* Interrupt context: fall back to the plain ARM implementation. */
	if (in_interrupt()) {
		xor_arm4regs_4(bytes, p1, p2, p3, p4);
		return;
	}

	kernel_neon_begin();
	xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
	kernel_neon_end();
}

/*
 * do_5 handler of the "neon" template.
 *
 * When called in interrupt context the NEON unit is not used and the work is
 * handed to xor_arm4regs_5() instead. Otherwise the NEON-built inner routine
 * runs inside a kernel_neon_begin()/kernel_neon_end() pair.
 */
static void
xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
		unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	/* Interrupt context: fall back to the plain ARM implementation. */
	if (in_interrupt()) {
		xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
		return;
	}

	kernel_neon_begin();
	xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
	kernel_neon_end();
}

/*
 * Template registered under the name "neon". Its handlers are the
 * xor_neon_N() wrappers above, which decide per call whether the NEON-built
 * routines may be used.
 */
static struct xor_block_template xor_block_neon = {
	.name = "neon",
	.do_2 = xor_neon_2,
	.do_3 = xor_neon_3,
	.do_4 = xor_neon_4,
	.do_5 = xor_neon_5,
};

/* Run xor_speed() on the "neon" template, but only when the CPU has NEON. */
#define NEON_TEMPLATES					\
	do {						\
		if (cpu_has_neon())			\
			xor_speed(&xor_block_neon);	\
	} while (0)
#else
#define NEON_TEMPLATES
#endif
+6 −0
Original line number Diff line number Diff line
@@ -45,3 +45,9 @@ lib-$(CONFIG_ARCH_SHARK) += io-shark.o

$(obj)/csumpartialcopy.o:	$(obj)/csumpartialcopygeneric.S
$(obj)/csumpartialcopyuser.o:	$(obj)/csumpartialcopygeneric.S

# Build the NEON XOR routines only when kernel mode NEON is enabled.
# The NEON compiler flags are applied to xor-neon.o alone, so no other
# object in this directory is compiled with them.
# NOTE(review): lib-$(CONFIG_XOR_BLOCKS) becomes lib-m when XOR_BLOCKS is
# built as a module -- confirm xor-neon.o is still linked/exported correctly
# in that configuration.
ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
  NEON_FLAGS			:= -mfloat-abi=softfp -mfpu=neon
  CFLAGS_xor-neon.o		+= $(NEON_FLAGS)
  lib-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
endif
+42 −0
Original line number Diff line number Diff line
/*
 * linux/arch/arm/lib/xor-neon.c
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/raid/xor.h>

#ifndef __ARM_NEON__
#error You should compile this file with '-mfloat-abi=softfp -mfpu=neon'
#endif

/*
 * Pull in the reference implementations while instructing GCC (through
 * -ftree-vectorize) to attempt to exploit implicit parallelism and emit
 * NEON instructions.
 */
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC optimize "tree-vectorize"
#else
/*
 * While older versions of GCC do not generate incorrect code, they fail to
 * recognize the parallel nature of these functions, and emit plain ARM code,
 * which is known to be slower than the optimized ARM code in asm-arm/xor.h.
 */
#warning This code requires at least version 4.6 of GCC
#endif

/*
 * Silence unused-variable warnings for the header included next.
 * NOTE(review): presumably asm-generic/xor.h defines static objects that
 * this file never references -- confirm against that header.
 */
#pragma GCC diagnostic ignored "-Wunused-variable"
#include <asm-generic/xor.h>

/*
 * The generic 8-register XOR routines from asm-generic/xor.h, as compiled in
 * this unit with the NEON flags. Exposed as a read-only template for callers
 * in other compilation units.
 */
const struct xor_block_template xor_block_neon_inner = {
	.name = "__inner_neon__",
	.do_2 = xor_8regs_2,
	.do_3 = xor_8regs_3,
	.do_4 = xor_8regs_4,
	.do_5 = xor_8regs_5,
};
Loading