Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 3ceaccdf authored by Dave Hansen's avatar Dave Hansen Committed by Ingo Molnar
Browse files

x86/mpx: Rewrite the unmap code



The MPX code needs to clear out bounds tables for memory which
is no longer in use.  We do this when a userspace mapping is
torn down (unmapped).

There are two modes:

  1. An entire bounds table becomes unused, and can be freed
     and its pointer removed from the bounds directory.  This
     happens either when a large mapping is torn down, or when
     a small mapping is torn down and it is the last mapping
     "covered" by a bounds table.

  2. Only part of a bounds table becomes unused, in which case
     we free the backing memory as if MADV_DONTNEED was called.

The old code was a spaghetti mess of "edge" bounds tables
where the edges were handled specially, even if we were
unmapping an entire one.  Non-edge bounds tables are always
fully unmapped, but share a different code path from the edge
ones.  The old code had a bug where it was unmapping too much
memory.  I worked on fixing it for two days and gave up.

I didn't write the original code.  I didn't particularly like
it, but it worked, so I left it.  After my debug session, I
realized it was undebuggagle *and* buggy, so out it went.

I also wrote a new unmapping test program which uncovers bugs
pretty nicely.

Signed-off-by: default avatarDave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: default avatarThomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave@sr71.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150607183706.DCAEC67D@viggo.jf.intel.com


Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent 613fcb7d
Loading
Loading
Loading
Loading
+168 −243
Original line number Diff line number Diff line
@@ -704,110 +704,6 @@ static int get_bt_addr(struct mm_struct *mm,
	return 0;
}

/*
 * Free the backing physical pages of bounds table 'bt_addr'.
 * Assume start...end is within that bounds table.
 */
static int zap_bt_entries(struct mm_struct *mm,
		unsigned long bt_addr,
		unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long addr, len;

	/*
	 * Find the first overlapping vma. If vma->vm_start > start, there
	 * will be a hole in the bounds table. This -EINVAL return will
	 * cause a SIGSEGV.
	 */
	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EINVAL;

	/*
	 * A NUMA policy on a VM_MPX VMA could cause this bouds table to
	 * be split. So we need to look across the entire 'start -> end'
	 * range of this bounds table, find all of the VM_MPX VMAs, and
	 * zap only those.
	 */
	addr = start;
	while (vma && vma->vm_start < end) {
		/*
		 * We followed a bounds directory entry down
		 * here.  If we find a non-MPX VMA, that's bad,
		 * so stop immediately and return an error.  This
		 * probably results in a SIGSEGV.
		 */
		if (!is_mpx_vma(vma))
			return -EINVAL;

		len = min(vma->vm_end, end) - addr;
		zap_page_range(vma, addr, len, NULL);
		trace_mpx_unmap_zap(addr, addr+len);

		vma = vma->vm_next;
		addr = vma->vm_start;
	}

	return 0;
}

static int unmap_single_bt(struct mm_struct *mm,
		long __user *bd_entry, unsigned long bt_addr)
{
	unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
	unsigned long uninitialized_var(actual_old_val);
	int ret;

	while (1) {
		int need_write = 1;
		unsigned long cleared_bd_entry = 0;

		pagefault_disable();
		ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,
				bd_entry, expected_old_val, cleared_bd_entry);
		pagefault_enable();
		if (!ret)
			break;
		if (ret == -EFAULT)
			ret = mpx_resolve_fault(bd_entry, need_write);
		/*
		 * If we could not resolve the fault, consider it
		 * userspace's fault and error out.
		 */
		if (ret)
			return ret;
	}
	/*
	 * The cmpxchg was performed, check the results.
	 */
	if (actual_old_val != expected_old_val) {
		/*
		 * Someone else raced with us to unmap the table.
		 * There was no bounds table pointed to by the
		 * directory, so declare success.  Somebody freed
		 * it.
		 */
		if (!actual_old_val)
			return 0;
		/*
		 * Something messed with the bounds directory
		 * entry.  We hold mmap_sem for read or write
		 * here, so it could not be a _new_ bounds table
		 * that someone just allocated.  Something is
		 * wrong, so pass up the error and SIGSEGV.
		 */
		return -EINVAL;
	}

	/*
	 * Note, we are likely being called under do_munmap() already. To
	 * avoid recursion, do_munmap() will check whether it comes
	 * from one bounds table through VM_MPX flag.
	 */
	return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm));
}

static inline int bt_entry_size_bytes(struct mm_struct *mm)
{
	if (is_64bit_mm(mm))
@@ -872,13 +768,69 @@ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
}

/*
 * Return an offset in terms of bytes in to the bounds
 * directory where the bounds directory entry for a given
 * virtual address resides.
 *
 * This has to be in bytes because the directory entries
 * are different sizes on 64/32 bit.
 * Free the backing physical pages of bounds table 'bt_addr'.
 * Assume start...end is within that bounds table.
 */
static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
		unsigned long bt_addr,
		unsigned long start_mapping, unsigned long end_mapping)
{
	struct vm_area_struct *vma;
	unsigned long addr, len;
	unsigned long start;
	unsigned long end;

	/*
	 * if we 'end' on a boundary, the offset will be 0 which
	 * is not what we want.  Back it up a byte to get the
	 * last bt entry.  Then once we have the entry itself,
	 * move 'end' back up by the table entry size.
	 */
	start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping);
	end   = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1);
	/*
	 * Move end back up by one entry.  Among other things
	 * this ensures that it remains page-aligned and does
	 * not screw up zap_page_range()
	 */
	end += bt_entry_size_bytes(mm);

	/*
	 * Find the first overlapping vma. If vma->vm_start > start, there
	 * will be a hole in the bounds table. This -EINVAL return will
	 * cause a SIGSEGV.
	 */
	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EINVAL;

	/*
	 * A NUMA policy on a VM_MPX VMA could cause this bounds table to
	 * be split. So we need to look across the entire 'start -> end'
	 * range of this bounds table, find all of the VM_MPX VMAs, and
	 * zap only those.
	 */
	addr = start;
	while (vma && vma->vm_start < end) {
		/*
		 * We followed a bounds directory entry down
		 * here.  If we find a non-MPX VMA, that's bad,
		 * so stop immediately and return an error.  This
		 * probably results in a SIGSEGV.
		 */
		if (!is_mpx_vma(vma))
			return -EINVAL;

		len = min(vma->vm_end, end) - addr;
		zap_page_range(vma, addr, len, NULL);
		trace_mpx_unmap_zap(addr, addr+len);

		vma = vma->vm_next;
		addr = vma->vm_start;
	}
	return 0;
}

static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm,
		unsigned long addr)
{
@@ -916,69 +868,80 @@ static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm,
	 */
}

/*
 * If the bounds table pointed by bounds directory 'bd_entry' is
 * not shared, unmap this whole bounds table. Otherwise, only free
 * those backing physical pages of bounds table entries covered
 * in this virtual address region start...end.
 */
static int unmap_shared_bt(struct mm_struct *mm,
		long __user *bd_entry, unsigned long start,
		unsigned long end, bool prev_shared, bool next_shared)
static int unmap_entire_bt(struct mm_struct *mm,
		long __user *bd_entry, unsigned long bt_addr)
{
	unsigned long bt_addr;
	unsigned long start_off, end_off;
	unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
	unsigned long uninitialized_var(actual_old_val);
	int ret;

	ret = get_bt_addr(mm, bd_entry, &bt_addr);
	while (1) {
		int need_write = 1;
		unsigned long cleared_bd_entry = 0;

		pagefault_disable();
		ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,
				bd_entry, expected_old_val, cleared_bd_entry);
		pagefault_enable();
		if (!ret)
			break;
		if (ret == -EFAULT)
			ret = mpx_resolve_fault(bd_entry, need_write);
		/*
	 * We could see an "error" ret for not-present bounds
	 * tables (not really an error), or actual errors, but
	 * stop unmapping either way.
		 * If we could not resolve the fault, consider it
		 * userspace's fault and error out.
		 */
		if (ret)
			return ret;

	start_off = mpx_get_bt_entry_offset_bytes(mm, start);
	end_off   = mpx_get_bt_entry_offset_bytes(mm, end);

	if (prev_shared && next_shared)
		ret = zap_bt_entries(mm, bt_addr,
				bt_addr + start_off,
				bt_addr + end_off);
	else if (prev_shared)
		ret = zap_bt_entries(mm, bt_addr,
				bt_addr + start_off,
				bt_addr + mpx_bt_size_bytes(mm));
	else if (next_shared)
		ret = zap_bt_entries(mm, bt_addr, bt_addr,
				bt_addr + end_off);
	else
		ret = unmap_single_bt(mm, bd_entry, bt_addr);

	return ret;
	}

	/*
 * A virtual address region being munmap()ed might share bounds table
 * with adjacent VMAs. We only need to free the backing physical
 * memory of these shared bounds tables entries covered in this virtual
 * address region.
	 * The cmpxchg was performed, check the results.
	 */
static int unmap_edge_bts(struct mm_struct *mm,
	if (actual_old_val != expected_old_val) {
		/*
		 * Someone else raced with us to unmap the table.
		 * That is OK, since we were both trying to do
		 * the same thing.  Declare success.
		 */
		if (!actual_old_val)
			return 0;
		/*
		 * Something messed with the bounds directory
		 * entry.  We hold mmap_sem for read or write
		 * here, so it could not be a _new_ bounds table
		 * that someone just allocated.  Something is
		 * wrong, so pass up the error and SIGSEGV.
		 */
		return -EINVAL;
	}
	/*
	 * Note, we are likely being called under do_munmap() already. To
	 * avoid recursion, do_munmap() will check whether it comes
	 * from one bounds table through VM_MPX flag.
	 */
	return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm));
}

static int try_unmap_single_bt(struct mm_struct *mm,
	       unsigned long start, unsigned long end)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	/*
	 * "bta" == Bounds Table Area: the area controlled by the
	 * bounds table that we are unmapping.
	 */
	unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1);
	unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm);
	unsigned long uninitialized_var(bt_addr);
	void __user *bde_vaddr;
	int ret;
	long __user *bde_start, *bde_end;
	struct vm_area_struct *prev, *next;
	bool prev_shared = false, next_shared = false;

	bde_start = mm->bd_addr + mpx_get_bd_entry_offset(mm, start);
	bde_end   = mm->bd_addr + mpx_get_bd_entry_offset(mm, end-1);

	/*
	 * Check whether bde_start and bde_end are shared with adjacent
	 * VMAs.
	 * We know 'start' and 'end' lie within an area controlled
	 * by a single bounds table.  See if there are any other
	 * VMAs controlled by that bounds table.  If there are not
	 * then we can "expand" the are we are unmapping to possibly
	 * cover the entire table.
	 *
	 * We already unliked the VMAs from the mm's rbtree so 'start'
	 * is guaranteed to be in a hole. This gets us the first VMA
@@ -986,102 +949,64 @@ static int unmap_edge_bts(struct mm_struct *mm,
	 * in to 'next'.
	 */
	next = find_vma_prev(mm, start, &prev);
	if (prev && (mm->bd_addr + mpx_get_bd_entry_offset(mm, prev->vm_end-1))
			== bde_start)
		prev_shared = true;
	if (next && (mm->bd_addr + mpx_get_bd_entry_offset(mm, next->vm_start))
			== bde_end)
		next_shared = true;

	if ((!prev || prev->vm_end <= bta_start_vaddr) &&
	    (!next || next->vm_start >= bta_end_vaddr)) {
		/*
	 * This virtual address region being munmap()ed is only
	 * covered by one bounds table.
	 *
	 * In this case, if this table is also shared with adjacent
	 * VMAs, only part of the backing physical memory of the bounds
	 * table need be freeed. Otherwise the whole bounds table need
	 * be unmapped.
	 */
	if (bde_start == bde_end) {
		return unmap_shared_bt(mm, bde_start, start, end,
				prev_shared, next_shared);
		 * No neighbor VMAs controlled by same bounds
		 * table.  Try to unmap the whole thing
		 */
		start = bta_start_vaddr;
		end = bta_end_vaddr;
	}

	bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start);
	ret = get_bt_addr(mm, bde_vaddr, &bt_addr);
	/*
	 * If more than one bounds tables are covered in this virtual
	 * address region being munmap()ed, we need to separately check
	 * whether bde_start and bde_end are shared with adjacent VMAs.
	 * No bounds table there, so nothing to unmap.
	 */
	ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false);
	if (ret)
		return ret;
	ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared);
	if (ret == -ENOENT) {
		ret = 0;
		return 0;
	}
	if (ret)
		return ret;

	return 0;
	/*
	 * We are unmapping an entire table.  Either because the
	 * unmap that started this whole process was large enough
	 * to cover an entire table, or that the unmap was small
	 * but was the area covered by a bounds table.
	 */
	if ((start == bta_start_vaddr) &&
	    (end == bta_end_vaddr))
		return unmap_entire_bt(mm, bde_vaddr, bt_addr);
	return zap_bt_entries_mapping(mm, bt_addr, start, end);
}

static int mpx_unmap_tables(struct mm_struct *mm,
		unsigned long start, unsigned long end)
{
	int ret;
	long __user *bd_entry, *bde_start, *bde_end;
	unsigned long bt_addr;

	unsigned long one_unmap_start;
	trace_mpx_unmap_search(start, end);
	/*
	 * "Edge" bounds tables are those which are being used by the region
	 * (start -> end), but that may be shared with adjacent areas.  If they
	 * turn out to be completely unshared, they will be freed.  If they are
	 * shared, we will free the backing store (like an MADV_DONTNEED) for
	 * areas used by this region.
	 */
	ret = unmap_edge_bts(mm, start, end);
	switch (ret) {
		/* non-present tables are OK */
		case 0:
		case -ENOENT:
			/* Success, or no tables to unmap */
			break;
		case -EINVAL:
		case -EFAULT:
		default:
			return ret;
	}

	one_unmap_start = start;
	while (one_unmap_start < end) {
		int ret;
		unsigned long next_unmap_start = ALIGN(one_unmap_start+1,
						       bd_entry_virt_space(mm));
		unsigned long one_unmap_end = end;
		/*
	 * Only unmap the bounds table that are
	 *   1. fully covered
	 *   2. not at the edges of the mapping, even if full aligned
	 */
	bde_start = mm->bd_addr + mpx_get_bd_entry_offset(mm, start);
	bde_end   = mm->bd_addr + mpx_get_bd_entry_offset(mm, end-1);
	for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) {
		ret = get_bt_addr(mm, bd_entry, &bt_addr);
		switch (ret) {
			case 0:
				break;
			case -ENOENT:
				/* No table here, try the next one */
				continue;
			case -EINVAL:
			case -EFAULT:
			default:
				/*
				 * Note: we are being strict here.
				 * Any time we run in to an issue
				 * unmapping tables, we stop and
				 * SIGSEGV.
		 * if the end is beyond the current bounds table,
		 * move it back so we only deal with a single one
		 * at a time
		 */
				return ret;
		}

		ret = unmap_single_bt(mm, bd_entry, bt_addr);
		if (one_unmap_end > next_unmap_start)
			one_unmap_end = next_unmap_start;
		ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end);
		if (ret)
			return ret;
	}

		one_unmap_start = next_unmap_start;
	}
	return 0;
}