x86/mm: Rework lazy TLB to track the actual loaded mm (3d28ebce) · Commits · e / devices / android_kernel_fairphone_FP4

arch/x86/events/core.c

+1 −2

Original line number	Diff line number	Diff line
		@@ -2101,8 +2101,7 @@ static int x86_pmu_event_init(struct perf_event *event)

		static void refresh_pce(void *ignored)
		{
		if (current->active_mm)
		load_mm_cr4(current->active_mm);
		load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
		}

		static void x86_pmu_event_mapped(struct perf_event *event)

arch/x86/include/asm/tlbflush.h

+10 −2

Original line number	Diff line number	Diff line
		@@ -66,7 +66,13 @@ static inline void invpcid_flush_all_nonglobals(void)
		#endif

		struct tlb_state {
		struct mm_struct *active_mm;
		/*
		* cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
		* are on. This means that it may not match current->active_mm,
		* which will contain the previous user mm when we're in lazy TLB
		* mode even if we've already switched back to swapper_pg_dir.
		*/
		struct mm_struct *loaded_mm;
		int state;

		/*
		@@ -256,7 +262,9 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
		static inline void reset_lazy_tlbstate(void)
		{
		this_cpu_write(cpu_tlbstate.state, 0);
		this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
		this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);

		WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir));
		}

		static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,

arch/x86/kernel/ldt.c

+4 −3

Original line number	Diff line number	Diff line
		@@ -22,14 +22,15 @@
		#include <asm/syscalls.h>

		/* context.lock is held for us, so we don't need any locking. */
		static void flush_ldt(void *current_mm)
		static void flush_ldt(void *__mm)
		{
		struct mm_struct *mm = __mm;
		mm_context_t *pc;

		if (current->active_mm != current_mm)
		if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
		return;

		pc = &current->active_mm->context;
		pc = &mm->context;
		set_ldt(pc->ldt->entries, pc->ldt->size);
		}

arch/x86/mm/init.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -811,7 +811,7 @@ void __init zone_sizes_init(void)
		}

		DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
		.active_mm = &init_mm,
		.loaded_mm = &init_mm,
		.state = 0,
		.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
		};

arch/x86/mm/tlb.c

+108 −108

Original line number	Diff line number	Diff line
		@@ -28,26 +28,25 @@
		* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
		*/

		/*
		* We cannot call mmdrop() because we are in interrupt context,
		* instead update mm->cpu_vm_mask.
		*/
		void leave_mm(int cpu)
		{
		struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();
		if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
		load_cr3(swapper_pg_dir);
		struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

		/*
		* This gets called in the idle path where RCU
		* functions differently. Tracing normally
		* uses RCU, so we have to call the tracepoint
		* specially here.
		* It's plausible that we're in lazy TLB mode while our mm is init_mm.
		* If so, our callers still expect us to flush the TLB, but there
		* aren't any user TLB entries in init_mm to worry about.
		*
		* This needs to happen before any other sanity checks due to
		* intel_idle's shenanigans.
		*/
		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
		}
		if (loaded_mm == &init_mm)
		return;

		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();

		switch_mm(NULL, &init_mm, NULL);
		}
		EXPORT_SYMBOL_GPL(leave_mm);

		@@ -65,8 +64,30 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
		struct task_struct *tsk)
		{
		unsigned cpu = smp_processor_id();
		struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);

		/*
		* NB: The scheduler will call us with prev == next when
		* switching from lazy TLB mode to normal mode if active_mm
		* isn't changing. When this happens, there is no guarantee
		* that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
		*
		* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
		*/

		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);

		if (real_prev == next) {
		/*
		* There's nothing to do: we always keep the per-mm control
		* regs in sync with cpu_tlbstate.loaded_mm. Just
		* sanity-check mm_cpumask.
		*/
		if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
		cpumask_set_cpu(cpu, mm_cpumask(next));
		return;
		}

		if (likely(prev != next)) {
		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
		/*
		* If our current stack is in vmalloc space and isn't
		@@ -81,9 +102,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
		set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
		}

		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		this_cpu_write(cpu_tlbstate.active_mm, next);
		this_cpu_write(cpu_tlbstate.loaded_mm, next);

		WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
		cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		@@ -112,14 +133,20 @@ void switch_mm_irqs_off(struct mm_struct prev, struct mm_struct next,
		* and neither LOCK nor MFENCE orders them.
		* Fortunately, load_cr3() is serializing and gives the
		* ordering guarantee we need.
		*
		*/
		load_cr3(next->pgd);

		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
		/*
		* This gets called via leave_mm() in the idle path where RCU
		* functions differently. Tracing normally uses RCU, so we have to
		* call the tracepoint specially here.
		*/
		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

		/* Stop flush ipis for the previous mm */
		cpumask_clear_cpu(cpu, mm_cpumask(prev));
		WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
		real_prev != &init_mm);
		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

		/* Load per-mm CR4 state */
		load_mm_cr4(next);
		@@ -137,36 +164,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
		* exists. That means that next->context.ldt !=
		* prev->context.ldt, because mms never share an LDT.
		*/
		if (unlikely(prev->context.ldt != next->context.ldt))
		if (unlikely(real_prev->context.ldt != next->context.ldt))
		load_mm_ldt(next);
		#endif
		} else {
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);

		if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
		/*
		* On established mms, the mm_cpumask is only changed
		* from irq context, from ptep_clear_flush() while in
		* lazy tlb mode, and here. Irqs are blocked during
		* schedule, protecting us from simultaneous changes.
		*/
		cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		* We were in lazy tlb mode and leave_mm disabled
		* tlb flush IPI delivery. We must reload CR3
		* to make sure to use no freed page tables.
		*
		* As above, load_cr3() is serializing and orders TLB
		* fills with respect to the mm_cpumask write.
		*/
		load_cr3(next->pgd);
		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
		load_mm_cr4(next);
		load_mm_ldt(next);
		}
		}
		}

		/*
		@@ -246,7 +246,7 @@ static void flush_tlb_func_remote(void *info)

		inc_irq_stat(irq_tlb_count);

		if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.active_mm))
		if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
		return;

		count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
		@@ -314,7 +314,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
		info.end = TLB_FLUSH_ALL;
		}

		if (mm == current->active_mm)
		if (mm == this_cpu_read(cpu_tlbstate.loaded_mm))
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
		if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);