Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c22ce143 authored by Hiro Yoshioka's avatar Hiro Yoshioka Committed by Linus Torvalds
Browse files

[PATCH] x86: cache pollution aware __copy_from_user_ll()



Use the x86 cache-bypassing copy instructions for copy_from_user().

Some performance data are

Total of GLOBAL_POWER_EVENTS (CPU cycle samples)

2.6.12.4.orig    1921587
2.6.12.4.nt      1599424
1599424/1921587=83.23% (16.77% reduction)

BSQ_CACHE_REFERENCE (L3 cache miss)
2.6.12.4.orig      57427
2.6.12.4.nt        20858
20858/57427=36.32% (63.7% reduction)

L3 cache miss reduction of __copy_from_user_ll
samples  %
37408    65.1412  vmlinux                  __copy_from_user_ll
23        0.1103  vmlinux                  __copy_user_zeroing_intel_nocache
23/37408=0.061% (99.94% reduction)

Top 5 of 2.6.12.4.nt
Counted GLOBAL_POWER_EVENTS events (time during which processor is not stopped) with a unit mask of 0x01 (mandatory) count 100000
samples  %        app name                 symbol name
128392    8.0274  vmlinux                  __copy_user_zeroing_intel_nocache
64206     4.0143  vmlinux                  journal_add_journal_head
59746     3.7355  vmlinux                  do_get_write_access
47674     2.9807  vmlinux                  journal_put_journal_head
46021     2.8774  vmlinux                  journal_dirty_metadata
pattern9-0-cpu4-0-09011728/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x3f (multiple flags) count 3000
samples  %        app name                 symbol name
69755     4.2861  vmlinux                  __copy_user_zeroing_intel_nocache
55685     3.4215  vmlinux                  journal_add_journal_head
52371     3.2179  vmlinux                  __find_get_block
45504     2.7960  vmlinux                  journal_put_journal_head
36005     2.2123  vmlinux                  journal_stop
pattern9-0-cpu4-0-09011744/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x200 (read 3rd level cache miss) count 3000
samples  %        app name                 symbol name
1147      5.4994  vmlinux                  journal_add_journal_head
881       4.2240  vmlinux                  journal_dirty_data
872       4.1809  vmlinux                  blk_rq_map_sg
734       3.5192  vmlinux                  journal_commit_transaction
617       2.9582  vmlinux                  radix_tree_delete
pattern9-0-cpu4-0-09011731/summary.out

iozone results are

original 2.6.12.4 CPU time = 207.768 sec
cache aware       CPU time = 184.783 sec
(three times run)
184.783/207.768=88.94% (11.06% reduction)

original:
pattern9-0-cpu4-0-08191720/iozone.out:  CPU Utilization: Wall time   45.997    CPU time   64.527    CPU utilization 140.28 %
pattern9-0-cpu4-0-08191741/iozone.out:  CPU Utilization: Wall time   46.878    CPU time   71.933    CPU utilization 153.45 %
pattern9-0-cpu4-0-08191743/iozone.out:  CPU Utilization: Wall time   45.152    CPU time   71.308    CPU utilization 157.93 %

cache awre:
pattern9-0-cpu4-0-09011728/iozone.out:  CPU Utilization: Wall time   44.842    CPU time   62.465    CPU utilization 139.30 %
pattern9-0-cpu4-0-09011731/iozone.out:  CPU Utilization: Wall time   44.718    CPU time   59.273    CPU utilization 132.55 %
pattern9-0-cpu4-0-09011744/iozone.out:  CPU Utilization: Wall time   44.367    CPU time   63.045    CPU utilization 142.10 %

Signed-off-by: default avatarHiro Yoshioka <hyoshiok@miraclelinux.com>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 7dbdf43c
Loading
Loading
Loading
Loading
+129 −8
Original line number Diff line number Diff line
@@ -425,15 +425,121 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
		       : "eax", "edx", "memory");
	return size;
}

/*
 * Non Temporal Hint version of __copy_user_zeroing_intel.  It is cache aware.
 * hyoshiok@miraclelinux.com
 */

static unsigned long __copy_user_zeroing_intel_nocache(void *to,
				const void __user *from, unsigned long size)
{
        int d0, d1;

	__asm__ __volatile__(
	       "        .align 2,0x90\n"
	       "0:      movl 32(%4), %%eax\n"
	       "        cmpl $67, %0\n"
	       "        jbe 2f\n"
	       "1:      movl 64(%4), %%eax\n"
	       "        .align 2,0x90\n"
	       "2:      movl 0(%4), %%eax\n"
	       "21:     movl 4(%4), %%edx\n"
	       "        movnti %%eax, 0(%3)\n"
	       "        movnti %%edx, 4(%3)\n"
	       "3:      movl 8(%4), %%eax\n"
	       "31:     movl 12(%4),%%edx\n"
	       "        movnti %%eax, 8(%3)\n"
	       "        movnti %%edx, 12(%3)\n"
	       "4:      movl 16(%4), %%eax\n"
	       "41:     movl 20(%4), %%edx\n"
	       "        movnti %%eax, 16(%3)\n"
	       "        movnti %%edx, 20(%3)\n"
	       "10:     movl 24(%4), %%eax\n"
	       "51:     movl 28(%4), %%edx\n"
	       "        movnti %%eax, 24(%3)\n"
	       "        movnti %%edx, 28(%3)\n"
	       "11:     movl 32(%4), %%eax\n"
	       "61:     movl 36(%4), %%edx\n"
	       "        movnti %%eax, 32(%3)\n"
	       "        movnti %%edx, 36(%3)\n"
	       "12:     movl 40(%4), %%eax\n"
	       "71:     movl 44(%4), %%edx\n"
	       "        movnti %%eax, 40(%3)\n"
	       "        movnti %%edx, 44(%3)\n"
	       "13:     movl 48(%4), %%eax\n"
	       "81:     movl 52(%4), %%edx\n"
	       "        movnti %%eax, 48(%3)\n"
	       "        movnti %%edx, 52(%3)\n"
	       "14:     movl 56(%4), %%eax\n"
	       "91:     movl 60(%4), %%edx\n"
	       "        movnti %%eax, 56(%3)\n"
	       "        movnti %%edx, 60(%3)\n"
	       "        addl $-64, %0\n"
	       "        addl $64, %4\n"
	       "        addl $64, %3\n"
	       "        cmpl $63, %0\n"
	       "        ja  0b\n"
	       "        sfence \n"
	       "5:      movl  %0, %%eax\n"
	       "        shrl  $2, %0\n"
	       "        andl $3, %%eax\n"
	       "        cld\n"
	       "6:      rep; movsl\n"
	       "        movl %%eax,%0\n"
	       "7:      rep; movsb\n"
	       "8:\n"
	       ".section .fixup,\"ax\"\n"
	       "9:      lea 0(%%eax,%0,4),%0\n"
	       "16:     pushl %0\n"
	       "        pushl %%eax\n"
	       "        xorl %%eax,%%eax\n"
	       "        rep; stosb\n"
	       "        popl %%eax\n"
	       "        popl %0\n"
	       "        jmp 8b\n"
	       ".previous\n"
	       ".section __ex_table,\"a\"\n"
	       "	.align 4\n"
	       "	.long 0b,16b\n"
	       "	.long 1b,16b\n"
	       "	.long 2b,16b\n"
	       "	.long 21b,16b\n"
	       "	.long 3b,16b\n"
	       "	.long 31b,16b\n"
	       "	.long 4b,16b\n"
	       "	.long 41b,16b\n"
	       "	.long 10b,16b\n"
	       "	.long 51b,16b\n"
	       "	.long 11b,16b\n"
	       "	.long 61b,16b\n"
	       "	.long 12b,16b\n"
	       "	.long 71b,16b\n"
	       "	.long 13b,16b\n"
	       "	.long 81b,16b\n"
	       "	.long 14b,16b\n"
	       "	.long 91b,16b\n"
	       "	.long 6b,9b\n"
	       "        .long 7b,16b\n"
	       ".previous"
	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
	       :  "1"(to), "2"(from), "0"(size)
	       : "eax", "edx", "memory");
	return size;
}

#else

/*
 * Leave these declared but undefined.  They should not be any references to
 * them
 */
unsigned long
__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size);
unsigned long
__copy_user_intel(void __user *to, const void *from, unsigned long size);
unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
					unsigned long size);
unsigned long __copy_user_intel(void __user *to, const void *from,
					unsigned long size);
unsigned long __copy_user_zeroing_intel_nocache(void *to,
				const void __user *from, unsigned long size);
#endif /* CONFIG_X86_INTEL_USERCOPY */

/* Generic arbitrary sized copy.  */
@@ -515,8 +621,8 @@ do { \
		: "memory");						\
} while (0)


unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n)
unsigned long __copy_to_user_ll(void __user *to, const void *from,
				unsigned long n)
{
	BUG_ON((long) n < 0);
#ifndef CONFIG_X86_WP_WORKS_OK
@@ -576,8 +682,8 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long
}
EXPORT_SYMBOL(__copy_to_user_ll);

unsigned long
__copy_from_user_ll(void *to, const void __user *from, unsigned long n)
unsigned long __copy_from_user_ll(void *to, const void __user *from,
					unsigned long n)
{
	BUG_ON((long)n < 0);
	if (movsl_is_ok(to, from, n))
@@ -588,6 +694,21 @@ __copy_from_user_ll(void *to, const void __user *from, unsigned long n)
}
EXPORT_SYMBOL(__copy_from_user_ll);

unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
					unsigned long n)
{
	BUG_ON((long)n < 0);
#ifdef CONFIG_X86_INTEL_USERCOPY
	if ( n > 64 && cpu_has_xmm2)
                n = __copy_user_zeroing_intel_nocache(to, from, n);
	else
		__copy_user_zeroing(to, from, n);
#else
        __copy_user_zeroing(to, from, n);
#endif
	return n;
}

/**
 * copy_to_user: - Copy a block of data into user space.
 * @to:   Destination address, in user space.
+33 −0
Original line number Diff line number Diff line
@@ -390,6 +390,8 @@ unsigned long __must_check __copy_to_user_ll(void __user *to,
				const void *from, unsigned long n);
unsigned long __must_check __copy_from_user_ll(void *to,
				const void __user *from, unsigned long n);
unsigned long __must_check __copy_from_user_ll_nocache(void *to,
				const void __user *from, unsigned long n);

/*
 * Here we special-case 1, 2 and 4-byte copy_*_user invocations.  On a fault
@@ -478,12 +480,43 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
	return __copy_from_user_ll(to, from, n);
}

#define ARCH_HAS_NOCACHE_UACCESS

static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to,
				const void __user *from, unsigned long n)
{
	if (__builtin_constant_p(n)) {
		unsigned long ret;

		switch (n) {
		case 1:
			__get_user_size(*(u8 *)to, from, 1, ret, 1);
			return ret;
		case 2:
			__get_user_size(*(u16 *)to, from, 2, ret, 2);
			return ret;
		case 4:
			__get_user_size(*(u32 *)to, from, 4, ret, 4);
			return ret;
		}
	}
	return __copy_from_user_ll_nocache(to, from, n);
}

static __always_inline unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
       might_sleep();
       return __copy_from_user_inatomic(to, from, n);
}

static __always_inline unsigned long
__copy_from_user_nocache(void *to, const void __user *from, unsigned long n)
{
       might_sleep();
       return __copy_from_user_inatomic_nocache(to, from, n);
}

unsigned long __must_check copy_to_user(void __user *to,
				const void *from, unsigned long n);
unsigned long __must_check copy_from_user(void *to,
+22 −0
Original line number Diff line number Diff line
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__

#include <asm/uaccess.h>

#ifndef ARCH_HAS_NOCACHE_UACCESS

static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
				const void __user *from, unsigned long n)
{
	return __copy_from_user_inatomic(to, from, n);
}

static inline unsigned long __copy_from_user_nocache(void *to,
				const void __user *from, unsigned long n)
{
	return __copy_from_user(to, from, n);
}

#endif		/* ARCH_HAS_NOCACHE_UACCESS */

#endif		/* __LINUX_UACCESS_H__ */
+2 −2
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
@@ -38,7 +39,6 @@
 */
#include <linux/buffer_head.h> /* for generic_osync_inode */

#include <asm/uaccess.h>
#include <asm/mman.h>

static ssize_t
@@ -1902,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
		int copy = min(bytes, iov->iov_len - base);

		base = 0;
		left = __copy_from_user_inatomic(vaddr, buf, copy);
		left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
		copied += copy;
		bytes -= copy;
		vaddr += copy;
+3 −3
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@
#include <linux/highmem.h>
#include <linux/uio.h>
#include <linux/config.h>
#include <asm/uaccess.h>
#include <linux/uaccess.h>

size_t
__filemap_copy_from_user_iovec(char *vaddr,
@@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
	int left;

	kaddr = kmap_atomic(page, KM_USER0);
	left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
	left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
	kunmap_atomic(kaddr, KM_USER0);

	if (left != 0) {
		/* Do it the slow way */
		kaddr = kmap(page);
		left = __copy_from_user(kaddr + offset, buf, bytes);
		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
		kunmap(page);
	}
	return bytes - left;