
Commit 723cacbd authored by Gerald Schaefer, committed by Martin Schwidefsky

s390/mm: fix asce_bits handling with dynamic pagetable levels



There is a race with multi-threaded applications between context switch and
pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and
mm->context.asce_bits, without holding any locks. A concurrent mmap with a
pagetable upgrade on another thread in crst_table_upgrade() could already
have set the new asce_bits, but not yet the new mm->pgd. This would result
in a corrupt user_asce in switch_mm(), and eventually in a kernel panic from
a translation exception.
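
To make the torn read concrete, here is a minimal user-space sketch of the same pattern (hypothetical demo code, not part of the patch; the variables merely stand in for the kernel fields): a writer updates two words one after the other while a reader combines them, so the reader can observe the new bits together with the old table origin.

/* Hypothetical demo of the race class fixed by this patch (not
 * kernel code). Build with: cc -std=c11 -pthread demo.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong asce_bits = 0x3UL;    /* stands in for mm->context.asce_bits */
static atomic_ulong pgd_pa    = 0x1000UL; /* stands in for __pa(mm->pgd) */

static void *upgrade(void *arg)
{
	/* like crst_table_upgrade(): two separate stores */
	atomic_store(&asce_bits, 0x7UL);  /* new table type first ...    */
	atomic_store(&pgd_pa, 0x2000UL);  /* ... new top-level table last */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, upgrade, NULL);
	/* like the old switch_mm(): combine the two words; a read that
	 * lands between the two stores yields 0x1007, i.e. new bits
	 * with the old table origin - a corrupt asce */
	unsigned long user_asce = atomic_load(&asce_bits) | atomic_load(&pgd_pa);
	printf("user_asce = %#lx\n", user_asce);
	pthread_join(t, NULL);
	return 0;
}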

Fix this by storing the complete asce instead of just the asce_bits, which
can then be read atomically from switch_mm(), so that it sees either the
old value or the new value, but never a mixture. Both cases are OK. Having
the old value would result in a page fault on access to the higher-level
memory, but the fault handler would see the new mm->pgd if the access was
valid after the mmap on the other thread had completed. So in the worst
case the racing thread would loop on page faults until its next time slice.
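
The shape of the fix, condensed from the hunks below (the writer side lives in arch/s390/mm and is not shown on this page, so treat this as a sketch of the pattern rather than the verbatim patch):

/* writer (pagetable upgrade): build the complete asce in one word,
 * published together with the new mm->pgd */
mm->pgd = (pgd_t *) table;
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
		   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;

/* reader (switch_mm): a single naturally aligned load, which is
 * atomic on s390 - it sees the old or the new asce, never a mix */
S390_lowcore.user_asce = next->context.asce;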

Also remove dead code and simplify the upgrade/downgrade path: there are no
upgrades from 2 levels, and only downgrades from 3 levels for compat tasks.
There are also no concurrent upgrades, because mmap_sem is held with
down_write() in do_mmap(), so the flush and table checks during the upgrade
can be removed.
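
For illustration, a condensed sketch of what the simplified upgrade path looks like after this change (the arch/s390/mm hunk is not included on this page, so this is an approximation inferred from the new crst_table_upgrade() prototype below, not the verbatim patch):

int crst_table_upgrade(struct mm_struct *mm)
{
	unsigned long *table;

	/* upgrades are only done from 3 levels to 4 levels */
	BUG_ON(mm->context.asce_limit != (1UL << 42));

	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;

	spin_lock_bh(&mm->page_table_lock);
	crst_table_init(table, _REGION2_ENTRY_EMPTY);
	pgd_populate(mm, (pgd_t *) table, (pud_t *) mm->pgd);
	mm->pgd = (pgd_t *) table;
	mm->context.asce_limit = 1UL << 53;
	/* the complete asce is published as a single word */
	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
	mm->task_size = mm->context.asce_limit;
	spin_unlock_bh(&mm->page_table_lock);

	on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

No retry loop or re-check of the table is needed because, as explained above, mmap_sem is held with down_write() by the caller.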

Reported-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
parent dba59909
arch/s390/include/asm/mmu.h (+1 −1)
@@ -11,7 +11,7 @@ typedef struct {
 	spinlock_t list_lock;
 	struct list_head pgtable_list;
 	struct list_head gmap_list;
-	unsigned long asce_bits;
+	unsigned long asce;
 	unsigned long asce_limit;
 	unsigned long vdso_base;
 	/* The mmu context allocates 4K page tables. */
arch/s390/include/asm/mmu_context.h (+22 −6)
@@ -26,12 +26,28 @@ static inline int init_new_context(struct task_struct *tsk,
 	mm->context.has_pgste = 0;
 	mm->context.use_skey = 0;
 #endif
-	if (mm->context.asce_limit == 0) {
+	switch (mm->context.asce_limit) {
+	case 1UL << 42:
+		/*
+		 * forked 3-level task, fall through to set new asce with new
+		 * mm->pgd
+		 */
+	case 0:
 		/* context created by exec, set asce limit to 4TB */
-		mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-			_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
 		mm->context.asce_limit = STACK_TOP_MAX;
-	} else if (mm->context.asce_limit == (1UL << 31)) {
+		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+				   _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+		break;
+	case 1UL << 53:
+		/* forked 4-level task, set new asce with new mm->pgd */
+		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+				   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+		break;
+	case 1UL << 31:
+		/* forked 2-level compat task, set new asce with new mm->pgd */
+		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+				   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
+		/* pgd_alloc() did not increase mm->nr_pmds */
 		mm_inc_nr_pmds(mm);
 	}
 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
@@ -42,7 +58,7 @@ static inline int init_new_context(struct task_struct *tsk,
 
 static inline void set_user_asce(struct mm_struct *mm)
 {
-	S390_lowcore.user_asce = mm->context.asce_bits | __pa(mm->pgd);
+	S390_lowcore.user_asce = mm->context.asce;
 	if (current->thread.mm_segment.ar4)
 		__ctl_load(S390_lowcore.user_asce, 7, 7);
 	set_cpu_flag(CIF_ASCE);
@@ -71,7 +87,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 {
 	int cpu = smp_processor_id();
 
-	S390_lowcore.user_asce = next->context.asce_bits | __pa(next->pgd);
+	S390_lowcore.user_asce = next->context.asce;
 	if (prev == next)
 		return;
 	if (MACHINE_HAS_TLB_LC)
arch/s390/include/asm/pgalloc.h (+2 −2)
@@ -52,8 +52,8 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm)
 	return _REGION2_ENTRY_EMPTY;
 }
 
-int crst_table_upgrade(struct mm_struct *, unsigned long limit);
-void crst_table_downgrade(struct mm_struct *, unsigned long limit);
+int crst_table_upgrade(struct mm_struct *);
+void crst_table_downgrade(struct mm_struct *);
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
 {
arch/s390/include/asm/processor.h (+1 −1)
@@ -175,7 +175,7 @@ extern __vector128 init_task_fpu_regs[__NUM_VXRS];
 	regs->psw.mask	= PSW_USER_BITS | PSW_MASK_BA;			\
 	regs->psw.addr	= new_psw;					\
 	regs->gprs[15]	= new_stackp;					\
-	crst_table_downgrade(current->mm, 1UL << 31);			\
+	crst_table_downgrade(current->mm);				\
 	execve_tail();							\
 } while (0)

arch/s390/include/asm/tlbflush.h (+3 −6)
@@ -110,8 +110,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
 static inline void __tlb_flush_kernel(void)
 {
 	if (MACHINE_HAS_IDTE)
-		__tlb_flush_idte((unsigned long) init_mm.pgd |
-				 init_mm.context.asce_bits);
+		__tlb_flush_idte(init_mm.context.asce);
 	else
 		__tlb_flush_global();
 }
@@ -133,8 +132,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
 static inline void __tlb_flush_kernel(void)
 {
 	if (MACHINE_HAS_TLB_LC)
-		__tlb_flush_idte_local((unsigned long) init_mm.pgd |
-				       init_mm.context.asce_bits);
+		__tlb_flush_idte_local(init_mm.context.asce);
 	else
 		__tlb_flush_local();
 }
@@ -148,8 +146,7 @@ static inline void __tlb_flush_mm(struct mm_struct * mm)
 	 * only ran on the local cpu.
 	 */
 	if (MACHINE_HAS_IDTE && list_empty(&mm->context.gmap_list))
-		__tlb_flush_asce(mm, (unsigned long) mm->pgd |
-				 mm->context.asce_bits);
+		__tlb_flush_asce(mm, mm->context.asce);
 	else
 		__tlb_flush_full(mm);
 }