
Commit dfb09f9b authored by Borislav Petkov, committed by H. Peter Anvin

x86, amd: Avoid cache aliasing penalties on AMD family 15h



This patch provides performance tuning for the "Bulldozer" CPU. With its
shared instruction cache there is a chance of generating an excessive
number of cache cross-invalidates when running specific workloads on the
cores of a compute module.

This excessive amount of cross-invalidations can be observed if cache
lines backed by shared physical memory alias in bits [14:12] of their
virtual addresses, as those bits are used for the index generation.

This patch addresses the issue by clearing all the bits in the [14:12]
slice of the file mapping's virtual address at generation time, thus
forcing those bits to be the same for all mappings of a single shared
library across processes and, in doing so, avoiding instruction cache
aliases.

It also adds the command line option "align_va_addr=(32|64|on|off)" with
which virtual address alignment can be enabled for 32-bit or 64-bit x86
individually, or both, or be completely disabled.
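
For illustration only (not part of the patch), on the kernel command
line this could look like:

	align_va_addr=64	(align only 64-bit processes' mappings)
	align_va_addr=off	(disable the alignment entirely)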

This change leaves virtual region address allocation on other families
and/or vendors unaffected.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Link: http://lkml.kernel.org/r/1312550110-24160-2-git-send-email-bp@amd64.org


Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
parent 13f9a373
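
As an illustration of the aliasing described above (not part of the
patch), here is a minimal userspace sketch. It assumes the F15h L1
instruction cache geometry of 64 KB, 2-way, with 64-byte lines, i.e.
512 sets indexed by address bits [14:6]. Bits [11:6] of that index come
from the page offset and are identical for every mapping of a given
file page, so only bits [14:12] can differ between mappings, and
clearing them forces all mappings onto the same set:

/*
 * Illustration only, not part of the patch.  Assumed geometry:
 * 64 KB, 2-way, 64-byte lines -> 512 sets, indexed by bits [14:6].
 */
#include <stdio.h>

#define LINE_SHIFT	6			/* 64-byte cache lines */
#define NR_SETS		512			/* 64 KB / (2 ways * 64 B) */
#define VA_ALIGN_MASK	0x7000UL		/* bits [14:12] */

static unsigned long icache_set(unsigned long va)
{
	return (va >> LINE_SHIFT) & (NR_SETS - 1);	/* index bits [14:6] */
}

int main(void)
{
	/* two mappings of the same page offset, differing in bits [14:12] */
	unsigned long map_a = 0x7f0000001840UL;
	unsigned long map_b = 0x7f0000006840UL;

	printf("unaligned: set %lu vs set %lu\n",
	       icache_set(map_a), icache_set(map_b));
	printf("aligned:   set %lu vs set %lu\n",
	       icache_set(map_a & ~VA_ALIGN_MASK),
	       icache_set(map_b & ~VA_ALIGN_MASK));
	return 0;
}

With these example addresses the two mappings hit sets 97 and 417
before alignment and both hit set 33 afterwards.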
Documentation/kernel-parameters.txt  +13 −0
@@ -299,6 +299,19 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
			behaviour to be specified.  Bit 0 enables warnings,
			bit 1 enables fixups, and bit 2 sends a segfault.

	align_va_addr=	[X86-64]
			Align virtual addresses by clearing slice [14:12] when
			allocating a VMA at process creation time. This option
			gives you up to 3% performance improvement on AMD F15h
			machines (where it is enabled by default) for a
			CPU-intensive style benchmark, and it can vary highly in
			a microbenchmark depending on workload and compiler.

			32: only for 32-bit processes
			64: only for 64-bit processes
			on: enable for both 32- and 64-bit processes
			off: disable for both 32- and 64-bit processes

	amd_iommu=	[HW,X86-64]
			Pass parameters to the AMD IOMMU driver in the system.
			Possible values are:
arch/x86/include/asm/elf.h  +31 −0
@@ -4,6 +4,7 @@
/*
 * ELF register definitions..
 */
#include <linux/thread_info.h>

#include <asm/ptrace.h>
#include <asm/user.h>
@@ -320,4 +321,34 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
extern unsigned long arch_randomize_brk(struct mm_struct *mm);
#define arch_randomize_brk arch_randomize_brk

/*
 * True on X86_32 or when emulating IA32 on X86_64
 */
static inline int mmap_is_ia32(void)
{
#ifdef CONFIG_X86_32
	return 1;
#endif
#ifdef CONFIG_IA32_EMULATION
	if (test_thread_flag(TIF_IA32))
		return 1;
#endif
	return 0;
}

/* The first two values are special, do not change. See align_addr() */
enum align_flags {
	ALIGN_VA_32	= BIT(0),
	ALIGN_VA_64	= BIT(1),
	ALIGN_VDSO	= BIT(2),
	ALIGN_TOPDOWN	= BIT(3),
};

struct va_alignment {
	int flags;
	unsigned long mask;
} ____cacheline_aligned;

extern struct va_alignment va_align;
extern unsigned long align_addr(unsigned long, struct file *, enum align_flags);
#endif /* _ASM_X86_ELF_H */
arch/x86/kernel/cpu/amd.c  +13 −0
@@ -458,6 +458,19 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
					"with P0 frequency!\n");
		}
	}

	if (c->x86 == 0x15) {
		unsigned long upperbit;
		u32 cpuid, assoc;

		cpuid	 = cpuid_edx(0x80000005);
		assoc	 = cpuid >> 16 & 0xff;
		upperbit = ((cpuid >> 24) << 10) / assoc;

		va_align.mask	  = (upperbit - 1) & PAGE_MASK;
		va_align.flags    = ALIGN_VA_32 | ALIGN_VA_64;

	}
}

static void __cpuinit init_amd(struct cpuinfo_x86 *c)
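
Worked through for illustration (not part of the patch), assuming CPUID
leaf 0x80000005 reports the F15h L1 I-cache as 64 KB and 2-way
associative (on AMD, EDX bits [31:24] hold the size in KB and bits
[23:16] the associativity):

	cpuid    = cpuid_edx(0x80000005);		/* size = 64 KB, assoc = 2 (assumed) */
	assoc    = (cpuid >> 16) & 0xff;		/* = 2 */
	upperbit = ((cpuid >> 24) << 10) / assoc;	/* = (64 << 10) / 2 = 0x8000, one way */
	va_align.mask = (0x8000 - 1) & PAGE_MASK;	/* = 0x7000, i.e. bits [14:12] */

That is, the mask ends up covering exactly the I-cache index bits above
the 4 KB page offset, and align_addr() below uses it to clear those
bits.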
arch/x86/kernel/sys_x86_64.c  +78 −3
@@ -18,6 +18,72 @@
#include <asm/ia32.h>
#include <asm/syscalls.h>

struct va_alignment __read_mostly va_align = {
	.flags = -1,
};

/*
 * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
 *
 * @flags denotes the allocation direction - bottomup or topdown -
 * or vDSO; see call sites below.
 */
unsigned long align_addr(unsigned long addr, struct file *filp,
			 enum align_flags flags)
{
	unsigned long tmp_addr;

	/* handle 32- and 64-bit case with a single conditional */
	if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
		return addr;

	if (!(current->flags & PF_RANDOMIZE))
		return addr;

	if (!((flags & ALIGN_VDSO) || filp))
		return addr;

	tmp_addr = addr;

	/*
	 * We need an address which is <= the original one,
	 * but only when allocating in the topdown direction.
	 */
	if (!(flags & ALIGN_TOPDOWN))
		tmp_addr += va_align.mask;

	tmp_addr &= ~va_align.mask;

	return tmp_addr;
}

static int __init control_va_addr_alignment(char *str)
{
	/* guard against enabling this on other CPU families */
	if (va_align.flags < 0)
		return 1;

	if (*str == 0)
		return 1;

	if (*str == '=')
		str++;

	if (!strcmp(str, "32"))
		va_align.flags = ALIGN_VA_32;
	else if (!strcmp(str, "64"))
		va_align.flags = ALIGN_VA_64;
	else if (!strcmp(str, "off"))
		va_align.flags = 0;
	else if (!strcmp(str, "on"))
		va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
	else
		return 0;

	return 1;
}
__setup("align_va_addr", control_va_addr_alignment);

SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, off)
@@ -92,6 +158,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
	start_addr = addr;

full_search:

	addr = align_addr(addr, filp, 0);

	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
		/* At this point:  (!vma || addr < vma->vm_end). */
		if (end - len < addr) {
@@ -117,6 +186,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
			mm->cached_hole_size = vma->vm_start - addr;

		addr = vma->vm_end;
		addr = align_addr(addr, filp, 0);
	}
}

@@ -161,10 +231,13 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,

	/* make sure it can fit in the remaining address space */
	if (addr > len) {
-		vma = find_vma(mm, addr-len);
-		if (!vma || addr <= vma->vm_start)
+		unsigned long tmp_addr = align_addr(addr - len, filp,
+						    ALIGN_TOPDOWN);
+
+		vma = find_vma(mm, tmp_addr);
+		if (!vma || tmp_addr + len <= vma->vm_start)
			/* remember the address as a hint for next time */
-			return mm->free_area_cache = addr-len;
+			return mm->free_area_cache = tmp_addr;
	}

	if (mm->mmap_base < len)
@@ -173,6 +246,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
	addr = mm->mmap_base-len;

	do {
		addr = align_addr(addr, filp, ALIGN_TOPDOWN);

		/*
		 * Lookup failure means no vma is above this address,
		 * else if new region fits below vma->vm_start,
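
For illustration (not part of the patch), with the F15h mask of 0x7000
computed in early_init_amd() above, align_addr() rounds a bottom-up
candidate up and a topdown candidate down, so the aligned address never
crosses back into space the caller has already ruled out:

	(0x3000 + 0x7000) & ~0x7000 == 0x8000	/* bottom-up: result >= candidate */
	 0xb000           & ~0x7000 == 0x8000	/* topdown:   result <= candidate */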
arch/x86/mm/mmap.c  +0 −15
@@ -51,21 +51,6 @@ static unsigned int stack_maxrandom_size(void)
#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)

-/*
- * True on X86_32 or when emulating IA32 on X86_64
- */
-static int mmap_is_ia32(void)
-{
-#ifdef CONFIG_X86_32
-	return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_IA32))
-		return 1;
-#endif
-	return 0;
-}
-
static int mmap_is_legacy(void)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)