Commit 85bcfaf6 authored by Nicholas Piggin, committed by Michael Ellerman

powerpc/64s/radix: optimise pte_update



Implementing pte_update with pte_xchg (which uses cmpxchg) is
inefficient. A single larx/stcx. works fine, no need for the less
efficient cmpxchg sequence.
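
As a rough illustration (a hypothetical userspace analogue using C11 atomics, not the kernel code), the removed path has the shape of a compare-and-swap retry loop; on powerpc the compare-and-swap itself expands to a larx/compare/branch/stcx. sequence, so a racing update restarts both the hardware reservation and the outer C loop:

#include <stdatomic.h>
#include <stdint.h>

/* Sketch only: apply "(old | set) & ~clr" to *ptep with a CAS retry loop,
 * returning the previous value (the shape of the removed pte_xchg() path). */
static uint64_t pte_update_cmpxchg(_Atomic uint64_t *ptep, uint64_t clr,
				   uint64_t set)
{
	uint64_t old = atomic_load_explicit(ptep, memory_order_relaxed);
	uint64_t new;

	do {
		new = (old | set) & ~clr;
		/* on failure the current value is written back into "old" */
	} while (!atomic_compare_exchange_weak_explicit(ptep, &old, new,
							memory_order_relaxed,
							memory_order_relaxed));
	return old;
}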

Then remove the memory barriers from the operation. There is a
requirement for TLB flushing to load mm_cpumask after the store
that reduces pte permissions, so that barrier is moved into the
TLB flush code.
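
The requirement can be sketched with C11 atomics (a hypothetical userspace illustration with made-up variable names, not the kernel code): the flush side must order its PTE store before the load of mm_cpumask, pairing with the full barrier that switch_mm_irqs_off() executes after a CPU starts using the mm:

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t pte;        /* stand-in for a page-table entry */
static _Atomic uint64_t mm_cpumask; /* stand-in for mm->cpu_bitmap     */

/* Flush side: reduce PTE permissions, full fence, then read the cpumask to
 * decide which CPUs need a TLB invalidation. */
static uint64_t flush_side(void)
{
	atomic_store_explicit(&pte, 0, memory_order_relaxed);  /* pte_update() store */
	atomic_thread_fence(memory_order_seq_cst);             /* smp_mb() in the flush path */
	return atomic_load_explicit(&mm_cpumask, memory_order_relaxed);
}

/* Switch side: publish this CPU in the cpumask, full fence, then begin using
 * translations read through the page tables. */
static uint64_t switch_side(int cpu)
{
	atomic_fetch_or_explicit(&mm_cpumask, 1ull << cpu, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);             /* smp_mb() in switch_mm_irqs_off() */
	return atomic_load_explicit(&pte, memory_order_relaxed);
}

Either the flushing CPU sees the newly arrived CPU in the mask and invalidates its TLB, or that CPU sees the already-reduced PTE; this is why the barrier can be dropped from pte_update() only if the radix flush paths below add smp_mb() before reading mm_cpumask.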

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
parent f1cb8f9b
+13 −12
@@ -131,20 +131,21 @@ extern void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep
 static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
 					       unsigned long set)
 {
-	pte_t pte;
-	unsigned long old_pte, new_pte;
-
-	do {
-		pte = READ_ONCE(*ptep);
-		old_pte = pte_val(pte);
-		new_pte = (old_pte | set) & ~clr;
-
-	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
-	return old_pte;
+	__be64 old_be, tmp_be;
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3		# pte_update\n"
+	"	andc	%1,%0,%5	\n"
+	"	or	%1,%1,%4	\n"
+	"	stdcx.	%1,0,%3		\n"
+	"	bne-	1b"
+	: "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep)
+	: "r" (ptep), "r" (cpu_to_be64(set)), "r" (cpu_to_be64(clr))
+	: "cc" );
+
+	return be64_to_cpu(old_be);
 }
 
 
 static inline unsigned long radix__pte_update(struct mm_struct *mm,
 					unsigned long addr,
 					pte_t *ptep, unsigned long clr,
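
The new sequence can be exercised on its own with a sketch along the following lines (hypothetical function name, 64-bit powerpc only, and without the cpu_to_be64()/be64_to_cpu() conversion the kernel version applies to match the stored PTE format):

#include <stdint.h>

/* Atomically compute "(*p | set) & ~clr" and return the previous value,
 * using one load-reserve/store-conditional pair per attempt. */
static inline uint64_t fetch_set_clear64(uint64_t *p, uint64_t set, uint64_t clr)
{
	uint64_t old, tmp;

	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3		\n"	/* old = *p, take reservation */
	"	andc	%1,%0,%5	\n"	/* tmp = old & ~clr           */
	"	or	%1,%1,%4	\n"	/* tmp |= set                 */
	"	stdcx.	%1,0,%3		\n"	/* *p = tmp if still reserved */
	"	bne-	1b"			/* reservation lost: retry    */
	: "=&r" (old), "=&r" (tmp), "=m" (*p)
	: "r" (p), "r" (set), "r" (clr)
	: "cc");

	return old;
}

The "=&r" earlyclobber outputs keep old and tmp out of the input registers, the "=m" output tells the compiler the pointed-to word is modified, and "cc" covers the condition-register update from stdcx.; compared with the cmpxchg loop there is no compare instruction and no outer C retry loop.
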
+4 −2
@@ -57,8 +57,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * in switch_slb(), and/or the store of paca->mm_ctx_id in
 		 * copy_mm_to_paca().
 		 *
-		 * On the read side the barrier is in pte_xchg(), which orders
-		 * the store to the PTE vs the load of mm_cpumask.
+		 * On the other side, the barrier is in mm/tlb-radix.c for
+		 * radix which orders earlier stores to clear the PTEs vs
+		 * the load of mm_cpumask. And pte_xchg which does the same
+		 * thing for hash.
 		 *
 		 * This full barrier is needed by membarrier when switching
 		 * between processes after store to rq->curr, before user-space
+10 −1
@@ -524,6 +524,11 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 		return;
 
 	preempt_disable();
+	/*
+	 * Order loads of mm_cpumask vs previous stores to clear ptes before
+	 * the invalidate. See barrier in switch_mm_irqs_off
+	 */
+	smp_mb();
 	if (!mm_is_thread_local(mm)) {
 		if (mm_needs_flush_escalation(mm))
 			_tlbie_pid(pid, RIC_FLUSH_ALL);
@@ -544,6 +549,7 @@ void radix__flush_all_mm(struct mm_struct *mm)
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
 	else
@@ -568,6 +574,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (!mm_is_thread_local(mm))
 		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	else
@@ -630,6 +637,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (mm_is_thread_local(mm)) {
 		local = true;
 		full = (end == TLB_FLUSH_ALL ||
@@ -791,6 +799,7 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (mm_is_thread_local(mm)) {
 		local = true;
 		full = (end == TLB_FLUSH_ALL ||
@@ -849,7 +858,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)

 	/* Otherwise first do the PWC, then iterate the pages. */
 	preempt_disable();
-
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (mm_is_thread_local(mm)) {
 		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	} else {