
Commit c46a7c81 authored by Mel Gorman, committed by Linus Torvalds

x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels



_PAGE_NUMA is currently an alias of _PAGE_PROTNONE to trap NUMA hinting
faults on x86.  Care is taken such that _PAGE_NUMA is used only in
situations where the VMA flags distinguish between NUMA hinting faults
and prot_none faults.  This decision was x86-specific and conceptually
difficult: it requires special casing to distinguish between PROTNONE
and NUMA ptes based on context.

Fundamentally, we only need the _PAGE_NUMA bit to tell the difference
between an entry that is really unmapped and a page that is protected
for NUMA hinting faults; since the PTE is not present in either case, a
fault will be trapped.

Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the swap
offset.  This patch shrinks the maximum possible swap size and uses the
first bit after _PAGE_GLOBAL to uniquely distinguish NUMA hinting ptes
from swap ptes.
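As an illustration of the resulting layout, here is a user-space sketch, not kernel code, assuming the bit numbers from this patch (_PAGE_BIT_PRESENT = 0, _PAGE_BIT_GLOBAL = 8, so _PAGE_BIT_NUMA = 9). A swap pte keeps the NUMA bit clear because its offset now starts above it (SWP_OFFSET_SHIFT = _PAGE_BIT_PROTNONE + 2 = 10 in the patch), so the two cases cannot be confused:

#include <assert.h>
#include <stdint.h>

#define PRESENT  (1ULL << 0)   /* _PAGE_BIT_PRESENT */
#define NUMA     (1ULL << 9)   /* _PAGE_BIT_GLOBAL + 1, per this patch */

/* NUMA hinting pte: present bit clear, NUMA bit set */
static int is_numa(uint64_t pte)
{
	return (pte & (PRESENT | NUMA)) == NUMA;
}

int main(void)
{
	uint64_t numa_pte = NUMA;        /* candidate for a hinting fault */
	uint64_t swap_pte = 42ULL << 10; /* offset starts above the NUMA bit */

	assert(is_numa(numa_pte));       /* trapped as a NUMA hinting fault */
	assert(!is_numa(swap_pte));      /* still a plain swap entry */
	return 0;
}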

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Noonan <steven@uplinklabs.net>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 4468dd76
+6 −0
@@ -44,6 +44,12 @@ static inline int pte_present(pte_t pte)
 	return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
 }
 
+#define pte_present_nonuma pte_present_nonuma
+static inline int pte_present_nonuma(pte_t pte)
+{
+	return pte_val(pte) & (_PAGE_PRESENT);
+}
+
 #define pte_numa pte_numa
 static inline int pte_numa(pte_t pte)
 {
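The effect of this hunk is that pte_present() now answers true for a NUMA hinting pte while the new pte_present_nonuma() does not. A minimal user-space model of the two predicates, with bit values assumed as in the patch and not taken from kernel code:

#include <assert.h>
#include <stdint.h>

#define PRESENT  (1ULL << 0)
#define NUMA     (1ULL << 9)

static int present(uint64_t pte)        { return (pte & (PRESENT | NUMA)) != 0; }
static int present_nonuma(uint64_t pte) { return (pte & PRESENT) != 0; }

int main(void)
{
	uint64_t numa_pte = NUMA;  /* present bit clear, NUMA bit set */

	assert(present(numa_pte));         /* treated as mapped...        */
	assert(!present_nonuma(numa_pte)); /* ...but not "really" mapped  */
	return 0;
}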
+11 −4
@@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte)
 
 static inline int pte_special(pte_t pte)
 {
-	return pte_flags(pte) & _PAGE_SPECIAL;
+	return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
+				 (_PAGE_PRESENT|_PAGE_SPECIAL);
 }
 
 static inline unsigned long pte_pfn(pte_t pte)
@@ -452,6 +453,12 @@ static inline int pte_present(pte_t a)
 			       _PAGE_NUMA);
 }
 
+#define pte_present_nonuma pte_present_nonuma
+static inline int pte_present_nonuma(pte_t a)
+{
+	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
 #define pte_accessible pte_accessible
 static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 {
@@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
 
 static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present(pte));
+	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 
 static inline int pte_swp_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present(pte));
+	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
 }
 
 static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present(pte));
+	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
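Two details here are worth spelling out. pte_special() gains a _PAGE_PRESENT check because _PAGE_BIT_SPECIAL (_PAGE_BIT_SOFTW1, bit 9) is the same physical bit as the new _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL + 1 = 9), so a non-present NUMA pte must not read as "special". The VM_BUG_ON assertions switch to pte_present_nonuma() because a NUMA hinting pte now satisfies pte_present() and would trip the old check. A small sanity check of the first point, as a user-space sketch rather than kernel code:

#include <assert.h>
#include <stdint.h>

#define PRESENT  (1ULL << 0)
#define SPECIAL  (1ULL << 9)   /* same bit as _PAGE_NUMA after this patch */

static int special(uint64_t pte)
{
	/* new check: both bits must be set */
	return (pte & (PRESENT | SPECIAL)) == (PRESENT | SPECIAL);
}

int main(void)
{
	uint64_t numa_pte    = SPECIAL;           /* present clear: NUMA hint */
	uint64_t special_pte = PRESENT | SPECIAL; /* genuinely special page   */

	assert(!special(numa_pte));   /* the old single-bit test was true here */
	assert(special(special_pte));
	return 0;
}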

+8 −0
@@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 /* Encode and de-code a swap entry */
 #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
 #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#ifdef CONFIG_NUMA_BALANCING
+/* Automatic NUMA balancing needs to be distinguishable from swap entries */
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
+#else
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#endif
 #else
+#ifdef CONFIG_NUMA_BALANCING
+#error Incompatible format for automatic NUMA balancing
+#endif
 #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
 #endif
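The cost of the extra shift is one bit of swap offset. A worked version of the 16TB to 8TB figure quoted in the patch comment, under the assumption (back-derived from that figure, not stated in the diff) that the usable offset field was effectively 32 bits wide before losing one bit, with offsets counting 4KB pages:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t page_size = 4096;
	/* the 16TB figure implies 2^32 addressable swap pages */
	uint64_t before = (1ULL << 32) * page_size;
	/* SWP_OFFSET_SHIFT grows by one, halving the offset space */
	uint64_t after  = (1ULL << 31) * page_size;

	printf("max swap before: %llu TB\n", (unsigned long long)(before >> 40));
	printf("max swap after:  %llu TB\n", (unsigned long long)(after >> 40));
	return 0;  /* prints 16 and 8 */
}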
+35 −31
@@ -16,15 +16,26 @@
 #define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page */
 #define _PAGE_BIT_PAT		7	/* on 4KB pages */
 #define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
-#define _PAGE_BIT_UNUSED1	9	/* available for programmer */
-#define _PAGE_BIT_IOMAP		10	/* flag used to indicate IO mapping */
-#define _PAGE_BIT_HIDDEN	11	/* hidden by kmemcheck */
+#define _PAGE_BIT_SOFTW1	9	/* available for programmer */
+#define _PAGE_BIT_SOFTW2	10	/* " */
+#define _PAGE_BIT_SOFTW3	11	/* " */
 #define _PAGE_BIT_PAT_LARGE	12	/* On 2MB or 1GB pages */
-#define _PAGE_BIT_SPECIAL	_PAGE_BIT_UNUSED1
-#define _PAGE_BIT_CPA_TEST	_PAGE_BIT_UNUSED1
-#define _PAGE_BIT_SPLITTING	_PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
+#define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
+#define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
+#define _PAGE_BIT_SPLITTING	_PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
+#define _PAGE_BIT_IOMAP		_PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
+#define _PAGE_BIT_HIDDEN	_PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
+#define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
 
+/*
+ * Swap offsets on configurations that allow automatic NUMA balancing use the
+ * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
+ * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
+ * maximum possible swap space from 16TB to 8TB.
+ */
+#define _PAGE_BIT_NUMA		(_PAGE_BIT_GLOBAL+1)
+
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
@@ -40,7 +51,7 @@
 #define _PAGE_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
 #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#define _PAGE_UNUSED1	(_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_IOMAP	(_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
@@ -61,14 +72,27 @@
  * they do not conflict with each other.
  */
 
-#define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_HIDDEN
-
 #ifdef CONFIG_MEM_SOFT_DIRTY
 #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
 #else
 #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 0))
 #endif
 
+/*
+ * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
+ * that is not present. The hinting fault gathers numa placement statistics
+ * (see pte_numa()). The bit is always zero when the PTE is not present.
+ *
+ * The bit picked must be always zero when the pmd is present and not
+ * present, so that we don't lose information when we set it while
+ * atomically clearing the present bit.
+ */
+#ifdef CONFIG_NUMA_BALANCING
+#define _PAGE_NUMA	(_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
+#else
+#define _PAGE_NUMA	(_AT(pteval_t, 0))
+#endif
+
 /*
  * Tracking soft dirty bit when a page goes to a swap is tricky.
  * We need a bit which can be stored in pte _and_ not conflict
@@ -94,26 +118,6 @@
 #define _PAGE_FILE	(_AT(pteval_t, 1) << _PAGE_BIT_FILE)
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
-/*
- * _PAGE_NUMA indicates that this page will trigger a numa hinting
- * minor page fault to gather numa placement statistics (see
- * pte_numa()). The bit picked (8) is within the range between
- * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
- * require changes to the swp entry format because that bit is always
- * zero when the pte is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- *
- * Because we shared the same bit (8) with _PAGE_PROTNONE this can be
- * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
- * couldn't reach, like handle_mm_fault() (see access_error in
- * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
- * handle_mm_fault() to be invoked).
- */
-#define _PAGE_NUMA	_PAGE_PROTNONE
-
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
 			 _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
@@ -122,8 +126,8 @@
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
 			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
-			 _PAGE_SOFT_DIRTY)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+			 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
 
 #define _PAGE_CACHE_MASK	(_PAGE_PCD | _PAGE_PWT)
 #define _PAGE_CACHE_WB		(0)
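To summarize the software-bit reshuffle in this file: bit 9 (_PAGE_BIT_SOFTW1) now carries SPECIAL, CPA_TEST and, via _PAGE_BIT_GLOBAL + 1, NUMA; bit 10 (_PAGE_BIT_SOFTW2) carries SPLITTING and IOMAP; bit 11 (_PAGE_BIT_SOFTW3) carries HIDDEN and SOFT_DIRTY. A standalone compile-time restatement of the overlap that motivates the pte_special() change, using only bit values taken from the hunk above (C11 sketch, not kernel code):

#define _PAGE_BIT_GLOBAL 8
#define _PAGE_BIT_SOFTW1 9
#define _PAGE_BIT_NUMA   (_PAGE_BIT_GLOBAL + 1)

/* fails to compile if the aliasing assumption is wrong */
_Static_assert(_PAGE_BIT_NUMA == _PAGE_BIT_SOFTW1,
	       "_PAGE_NUMA shares the software bit used by _PAGE_SPECIAL");

int main(void) { return 0; }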
+1 −1
@@ -35,7 +35,7 @@ enum {
 
 static int pte_testbit(pte_t pte)
 {
-	return pte_flags(pte) & _PAGE_UNUSED1;
+	return pte_flags(pte) & _PAGE_SOFTW1;
 }
 
 struct split_state {