Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 203b4fc9 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Thomas Gleixner:

 - Make lazy TLB mode even lazier to avoid pointless switch_mm()
   operations, which reduces CPU load by 1-2% for memcache workloads

 - Small cleanups and improvements all over the place

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm: Remove redundant check for kmem_cache_create()
  arm/asm/tlb.h: Fix build error implicit func declaration
  x86/mm/tlb: Make clear_asid_other() static
  x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off()
  x86/mm/tlb: Always use lazy TLB mode
  x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Restructure switch_mm_irqs_off()
  x86/mm/tlb: Leave lazy TLB mode at page table free time
  mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids
  x86/mm: Add TLB purge to free pmd/pte page interfaces
  ioremap: Update pgtable free interfaces with addr
  x86/mm: Disable ioremap free page handling on x86-PAE
parents 7edcf0d3 765d28f1
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -292,5 +292,13 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
{
}

static inline void tlb_flush_remove_tables(struct mm_struct *mm)
{
}

static inline void tlb_flush_remove_tables_local(void *arg)
{
}

#endif /* CONFIG_MMU */
#endif
+2 −2
Original line number Diff line number Diff line
@@ -977,12 +977,12 @@ int pmd_clear_huge(pmd_t *pmdp)
	return 1;
}

int pud_free_pmd_page(pud_t *pud)
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	return pud_none(*pud);
}

int pmd_free_pte_page(pmd_t *pmd)
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	return pmd_none(*pmd);
}
+5 −16
Original line number Diff line number Diff line
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif

static inline bool tlb_defer_switch_to_init_mm(void)
{
	/*
	 * If we have PCID, then switching to init_mm is reasonably
	 * fast.  If we don't have PCID, then switching to init_mm is
	 * quite slow, so we try to defer it in the hopes that we can
	 * avoid it entirely.  The latter approach runs the risk of
	 * receiving otherwise unnecessary IPIs.
	 *
	 * This choice is just a heuristic.  The tlb code can handle this
	 * function returning true or false regardless of whether we have
	 * PCID.
	 */
	return !static_cpu_has(X86_FEATURE_PCID);
}

struct tlb_context {
	u64 ctx_id;
	u64 tlb_gen;
@@ -554,4 +538,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
	native_flush_tlb_others(mask, info)
#endif

extern void tlb_flush_remove_tables(struct mm_struct *mm);
extern void tlb_flush_remove_tables_local(void *arg);

#define HAVE_TLB_FLUSH_REMOVE_TABLES

#endif /* _ASM_X86_TLBFLUSH_H */
+53 −11
Original line number Diff line number Diff line
@@ -329,9 +329,6 @@ static int __init pgd_cache_init(void)
	 */
	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
				      SLAB_PANIC, NULL);
	if (!pgd_cache)
		return -ENOMEM;

	return 0;
}
core_initcall(pgd_cache_init);
@@ -719,28 +716,50 @@ int pmd_clear_huge(pmd_t *pmd)
	return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Context: The pud range has been unmaped and TLB purged.
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud)
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd;
	pmd_t *pmd, *pmd_sv;
	pte_t *pte;
	int i;

	if (pud_none(*pud))
		return 1;

	pmd = (pmd_t *)pud_page_vaddr(*pud);

	for (i = 0; i < PTRS_PER_PMD; i++)
		if (!pmd_free_pte_page(&pmd[i]))
	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
	if (!pmd_sv)
		return 0;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd_sv[i] = pmd[i];
		if (!pmd_none(pmd[i]))
			pmd_clear(&pmd[i]);
	}

	pud_clear(pud);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd_sv[i])) {
			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
			free_page((unsigned long)pte);
		}
	}

	free_page((unsigned long)pmd_sv);
	free_page((unsigned long)pmd);

	return 1;
@@ -749,11 +768,12 @@ int pud_free_pmd_page(pud_t *pud)
/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * Context: The pmd range has been unmaped and TLB purged.
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd)
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

@@ -762,8 +782,30 @@ int pmd_free_pte_page(pmd_t *pmd)

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	free_page((unsigned long)pte);

	return 1;
}

#else /* !CONFIG_X86_64 */

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	return pud_none(*pud);
}

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd pmd entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
+165 −59
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/gfp.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -35,7 +36,7 @@
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
void clear_asid_other(void)
static void clear_asid_other(void)
{
	u16 asid;

@@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;
	bool need_flush;
	u16 new_asid;

	/*
	 * NB: The scheduler will call us with prev == next when switching
@@ -240,20 +244,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			   next->context.ctx_id);

		/*
		 * We don't currently support having a real mm loaded without
		 * our cpu set in mm_cpumask().  We have all the bookkeeping
		 * in place to figure out whether we would need to flush
		 * if our cpu were cleared in mm_cpumask(), but we don't
		 * currently use it.
		 * Even in lazy TLB mode, the CPU should stay set in the
		 * mm_cpumask. The TLB shootdown code can figure out from
		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
		 */
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		 * If the CPU is not in lazy TLB mode, we are just switching
		 * from one thread in a process to another thread in the same
		 * process. No TLB flush required.
		 */
		if (!was_lazy)
			return;

		/*
		 * Read the tlb_gen to check whether a flush is needed.
		 * If the TLB is up to date, just use it.
		 * The barrier synchronizes with the tlb_gen increment in
		 * the TLB shootdown code.
		 */
		smp_mb();
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
				next_tlb_gen)
			return;

		/*
		 * TLB contents went out of date while we were in lazy
		 * mode. Fall through to the TLB switching code below.
		 */
		new_asid = prev_asid;
		need_flush = true;
	} else {
		u16 new_asid;
		bool need_flush;
		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);

		/*
@@ -285,18 +310,26 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			sync_current_stack_to_mm(next);
		}

		/* Stop remote flushes for the previous mm */
		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
				real_prev != &init_mm);
		/*
		 * Stop remote flushes for the previous mm.
		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
		 * but the bitmap manipulation can cause cache line contention.
		 */
		if (real_prev != &init_mm) {
			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
						mm_cpumask(real_prev)));
			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
		}

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		if (next != &init_mm)
			cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
	}

	if (need_flush) {
		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
@@ -330,7 +363,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,

	this_cpu_write(cpu_tlbstate.loaded_mm, next);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
	}

	load_mm_cr4(next);
	switch_ldt(real_prev, next);
@@ -354,20 +386,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
		return;

	if (tlb_defer_switch_to_init_mm()) {
		/*
		 * There's a significant optimization that may be possible
		 * here.  We have accurate enough TLB flush tracking that we
		 * don't need to maintain coherence of TLB per se when we're
		 * lazy.  We do, however, need to maintain coherence of
		 * paging-structure caches.  We could, in principle, leave our
		 * old mm loaded and only switch to init_mm when
		 * tlb_remove_page() happens.
		 */
	this_cpu_write(cpu_tlbstate.is_lazy, true);
	} else {
		switch_mm(NULL, &init_mm, NULL);
	}
}

/*
@@ -454,6 +473,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
		 * paging-structure cache to avoid speculatively reading
		 * garbage into our TLB.  Since switching to init_mm is barely
		 * slower than a minimal flush, just switch to init_mm.
		 *
		 * This should be rare, with native_flush_tlb_others skipping
		 * IPIs to lazy TLB mode CPUs.
		 */
		switch_mm_irqs_off(NULL, &init_mm, NULL);
		return;
@@ -560,6 +582,9 @@ static void flush_tlb_func_remote(void *info)
void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
	cpumask_var_t lazymask;
	unsigned int cpu;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,8 +608,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
		 * that UV should be updated so that smp_call_function_many(),
		 * etc, are optimal on UV.
		 */
		unsigned int cpu;

		cpu = smp_processor_id();
		cpumask = uv_flush_tlb_others(cpumask, info);
		if (cpumask)
@@ -592,8 +615,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
					       (void *)info, 1);
		return;
	}

	/*
	 * A temporary cpumask is used in order to skip sending IPIs
	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
	 */
	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
		smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
		return;
	}

	cpumask_copy(lazymask, cpumask);

	for_each_cpu(cpu, lazymask) {
		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
			cpumask_clear_cpu(cpu, lazymask);
	}

	smp_call_function_many(lazymask, flush_tlb_func_remote,
			       (void *)info, 1);

	free_cpumask_var(lazymask);
}

/*
@@ -646,6 +690,68 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
	put_cpu();
}

void tlb_flush_remove_tables_local(void *arg)
{
	struct mm_struct *mm = arg;

	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
			this_cpu_read(cpu_tlbstate.is_lazy)) {
		/*
		 * We're in lazy mode.  We need to at least flush our
		 * paging-structure cache to avoid speculatively reading
		 * garbage into our TLB.  Since switching to init_mm is barely
		 * slower than a minimal flush, just switch to init_mm.
		 */
		switch_mm_irqs_off(NULL, &init_mm, NULL);
	}
}

static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
				      struct cpumask *lazy_cpus)
{
	int cpu;

	for_each_cpu(cpu, mm_cpumask(mm)) {
		if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
			cpumask_set_cpu(cpu, lazy_cpus);
	}
}

void tlb_flush_remove_tables(struct mm_struct *mm)
{
	int cpu = get_cpu();
	cpumask_var_t lazy_cpus;

	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
		put_cpu();
		return;
	}

	if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
		/*
		 * If the cpumask allocation fails, do a brute force flush
		 * on all the CPUs that have this mm loaded.
		 */
		smp_call_function_many(mm_cpumask(mm),
				tlb_flush_remove_tables_local, (void *)mm, 1);
		put_cpu();
		return;
	}

	/*
	 * CPUs with !is_lazy either received a TLB flush IPI while the user
	 * pages in this address range were unmapped, or have context switched
	 * and reloaded %CR3 since then.
	 *
	 * Shootdown IPIs at page table freeing time only need to be sent to
	 * CPUs that may have out of date TLB contents.
	 */
	mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
	smp_call_function_many(lazy_cpus,
				tlb_flush_remove_tables_local, (void *)mm, 1);
	free_cpumask_var(lazy_cpus);
	put_cpu();
}

static void do_flush_tlb_all(void *info)
{
Loading