
Commit 478a1469 authored by Linus Torvalds
Pull DAX locking updates from Ross Zwisler:
 "Filesystem DAX locking for 4.7

   - We use a bit in an exceptional radix tree entry as a lock bit and
     use it similarly to how page lock is used for normal faults.  This
     fixes races between hole instantiation and read faults of the same
     index.

   - Filesystem DAX PMD faults are disabled, and will be re-enabled when
     PMD locking is implemented"

* tag 'dax-locking-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  dax: Remove i_mmap_lock protection
  dax: Use radix tree entry lock to protect cow faults
  dax: New fault locking
  dax: Allow DAX code to replace exceptional entries
  dax: Define DAX lock bit for radix tree exceptional entry
  dax: Make huge page handling depend of CONFIG_BROKEN
  dax: Fix condition for filling of PMD holes
parents 315227f6 4d9a2c87
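
The message above is terse about how the lock actually lives in the radix tree, so here is a minimal standalone sketch (plain userspace C, not kernel code) of the entry layout the patches below introduce: the sector is shifted above three "special" bits, two of which encode the PTE/PMD type and one of which is the new lock bit. The numeric values assume RADIX_TREE_EXCEPTIONAL_ENTRY == 0x2 and RADIX_TREE_EXCEPTIONAL_SHIFT == 2, as kernels of this era define them; everything else mirrors the RADIX_DAX_* macros in the fs/dax.c and dax.h hunks further down.

/* Standalone sketch, not kernel code. Bit positions are assumptions noted
 * in the text above; the macro structure mirrors the new RADIX_DAX_* ones. */
#include <stdio.h>

#define RADIX_TREE_EXCEPTIONAL_ENTRY	2	/* assumed value */
#define RADIX_TREE_EXCEPTIONAL_SHIFT	2	/* assumed value */

#define RADIX_DAX_ENTRY_LOCK	(1UL << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PTE		(1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_PMD		(1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_TYPE_MASK	(RADIX_DAX_PTE | RADIX_DAX_PMD)
#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)

/* Encode a sector into an exceptional entry of the given type. */
static unsigned long dax_entry(unsigned long sector, int pmd)
{
	return (sector << RADIX_DAX_SHIFT) |
	       (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) |
	       RADIX_TREE_EXCEPTIONAL_ENTRY;
}

int main(void)
{
	unsigned long entry = dax_entry(12345, 0);

	entry |= RADIX_DAX_ENTRY_LOCK;		/* what lock_slot() does   */
	printf("locked: %d\n", !!(entry & RADIX_DAX_ENTRY_LOCK));
	entry &= ~RADIX_DAX_ENTRY_LOCK;		/* what unlock_slot() does */

	printf("sector: %lu\n", entry >> RADIX_DAX_SHIFT);
	printf("is PMD: %d\n", (entry & RADIX_DAX_TYPE_MASK) == RADIX_DAX_PMD);
	return 0;
}

Because only the lock bit flips while an entry is locked and unlocked, the slot can be updated in place under mapping->tree_lock, which is what lets a fault pin an index without allocating anything new.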
+1 −0
@@ -52,6 +52,7 @@ config FS_DAX_PMD
	depends on FS_DAX
	depends on ZONE_DEVICE
	depends on TRANSPARENT_HUGEPAGE
+	depends on BROKEN

endif # BLOCK

+421 −171
@@ -32,14 +32,43 @@
#include <linux/pfn_t.h>
#include <linux/sizes.h>

-#define RADIX_DAX_MASK	0xf
-#define RADIX_DAX_SHIFT	4
-#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+/*
+ * We use lowest available bit in exceptional entry for locking, other two
+ * bits to determine entry type. In total 3 special bits.
+ */
+#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
+#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
+		RADIX_TREE_EXCEPTIONAL_ENTRY))
+
+/* We choose 4096 entries - same as per-zone page wait tables */
+#define DAX_WAIT_TABLE_BITS 12
+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+
+wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+
+static int __init init_dax_wait_table(void)
+{
+	int i;
+
+	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
+		init_waitqueue_head(wait_table + i);
+	return 0;
+}
+fs_initcall(init_dax_wait_table);
+
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+					      pgoff_t index)
+{
+	unsigned long hash = hash_long((unsigned long)mapping ^ index,
+				       DAX_WAIT_TABLE_BITS);
+	return wait_table + hash;
+}

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
@@ -262,6 +291,263 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
}
EXPORT_SYMBOL_GPL(dax_do_io);

/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	unsigned long index;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->index != ewait->key.index)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked. The function must be called with
 * mapping->tree_lock held
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot is locked. The function must be called with
 * mapping->tree_lock held
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot is unlocked. The function must be called with
 * mapping->tree_lock held
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Lookup entry in radix tree, wait for it to become unlocked if it is
 * exceptional entry and return it. The caller must call
 * put_unlocked_mapping_entry() when he decided not to lock the entry or
 * put_locked_mapping_entry() when he locked the entry and now wants to
 * unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *ret, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;
	ewait.key.mapping = mapping;
	ewait.key.index = index;

	for (;;) {
		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					  &slot);
		if (!ret || !radix_tree_exceptional_entry(ret) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return ret;
		}
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

/*
 * Find radix tree entry at given index. If it points to a page, return with
 * the page locked. If it points to the exceptional entry, return with the
 * radix tree entry locked. If the radix tree doesn't contain given index,
 * create empty exceptional entry for the index and return with it locked.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	ret = get_unlocked_mapping_entry(mapping, index, &slot);
	/* No entry for given index? Make sure radix tree is big enough. */
	if (!ret) {
		int err;

		spin_unlock_irq(&mapping->tree_lock);
		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err)
			return ERR_PTR(err);
		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
			       RADIX_DAX_ENTRY_LOCK);
		spin_lock_irq(&mapping->tree_lock);
		err = radix_tree_insert(&mapping->page_tree, index, ret);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/* Someone already created the entry? */
			if (err == -EEXIST)
				goto restart;
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return ret;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(ret)) {
		struct page *page = ret;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	ret = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

void dax_wake_mapping_entry_waiter(struct address_space *mapping,
				   pgoff_t index, bool wake_all)
{
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq)) {
		struct exceptional_entry_key key;

		key.mapping = mapping;
		key.index = index;
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
	}
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

	spin_lock_irq(&mapping->tree_lock);
	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen exceptional entry for this index, we better find it
	 * at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	mapping->nrexceptional--;
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, true);

	return 1;
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
@@ -270,15 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io);
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
-static int dax_load_hole(struct address_space *mapping, struct page *page,
+static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
-	if (!page)
-		page = find_or_create_page(mapping, vmf->pgoff,
-						GFP_KERNEL | __GFP_ZERO);
-	if (!page)
-		return VM_FAULT_OOM;
-
+	struct page *page;
+
+	/* Hole page already exists? Return it...  */
+	if (!radix_tree_exceptional_entry(entry)) {
+		vmf->page = entry;
+		return VM_FAULT_LOCKED;
+	}
+
+	/* This will replace locked radix tree entry with a hole page */
+	page = find_or_create_page(mapping, vmf->pgoff,
+				   vmf->gfp_mask | __GFP_ZERO);
+	if (!page) {
+		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+		return VM_FAULT_OOM;
+	}
	vmf->page = page;
	return VM_FAULT_LOCKED;
}
@@ -302,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode,
	return 0;
}

-#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
-		sector_t sector, bool pmd_entry, bool dirty)
+static void *dax_insert_mapping_entry(struct address_space *mapping,
+				      struct vm_fault *vmf,
+				      void *entry, sector_t sector)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
-	pgoff_t pmd_index = DAX_PMD_INDEX(index);
-	int type, error = 0;
-	void *entry;
+	int error = 0;
+	bool hole_fill = false;
+	void *new_entry;
+	pgoff_t index = vmf->pgoff;

-	WARN_ON_ONCE(pmd_entry && !dirty);
-	if (dirty)
+	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

-	spin_lock_irq(&mapping->tree_lock);
-
-	entry = radix_tree_lookup(page_tree, pmd_index);
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
-		index = pmd_index;
-		goto dirty;
-	}
-
-	entry = radix_tree_lookup(page_tree, index);
-	if (entry) {
-		type = RADIX_DAX_TYPE(entry);
-		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
-					type != RADIX_DAX_PMD)) {
-			error = -EIO;
-			goto unlock;
-		}
-
-		if (!pmd_entry || type == RADIX_DAX_PMD)
-			goto dirty;
-
+	/* Replacing hole page with block mapping? */
+	if (!radix_tree_exceptional_entry(entry)) {
+		hole_fill = true;
		/*
-		 * We only insert dirty PMD entries into the radix tree.  This
-		 * means we don't need to worry about removing a dirty PTE
-		 * entry and inserting a clean PMD entry, thus reducing the
-		 * range we would flush with a follow-up fsync/msync call.
+		 * Unmap the page now before we remove it from page cache below.
+		 * The page is locked so it cannot be faulted in again.
		 */
-		radix_tree_delete(&mapping->page_tree, index);
-		mapping->nrexceptional--;
+		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+				    PAGE_SIZE, 0);
+		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
+		if (error)
+			return ERR_PTR(error);
	}

-	if (sector == NO_SECTOR) {
-		/*
-		 * This can happen during correct operation if our pfn_mkwrite
-		 * fault raced against a hole punch operation.  If this
-		 * happens the pte that was hole punched will have been
-		 * unmapped and the radix tree entry will have been removed by
-		 * the time we are called, but the call will still happen.  We
-		 * will return all the way up to wp_pfn_shared(), where the
-		 * pte_same() check will fail, eventually causing page fault
-		 * to be retried by the CPU.
-		 */
-		goto unlock;
-	}
-
-	error = radix_tree_insert(page_tree, index,
-			RADIX_DAX_ENTRY(sector, pmd_entry));
-	if (error)
-		goto unlock;
-
-	mapping->nrexceptional++;
- dirty:
-	if (dirty)
+	spin_lock_irq(&mapping->tree_lock);
+	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
+		       RADIX_DAX_ENTRY_LOCK);
+	if (hole_fill) {
+		__delete_from_page_cache(entry, NULL);
+		/* Drop pagecache reference */
+		put_page(entry);
+		error = radix_tree_insert(page_tree, index, new_entry);
+		if (error) {
+			new_entry = ERR_PTR(error);
+			goto unlock;
+		}
+		mapping->nrexceptional++;
+	} else {
+		void **slot;
+		void *ret;
+
+		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+		WARN_ON_ONCE(ret != entry);
+		radix_tree_replace_slot(slot, new_entry);
+	}
+	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
-	return error;
+	if (hole_fill) {
+		radix_tree_preload_end();
+		/*
+		 * We don't need hole page anymore, it has been replaced with
+		 * locked radix tree entry now.
+		 */
+		if (mapping->a_ops->freepage)
+			mapping->a_ops->freepage(entry);
+		unlock_page(entry);
+		put_page(entry);
+	}
+	return new_entry;
}

static int dax_writeback_one(struct block_device *bdev,
@@ -498,37 +788,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+static int dax_insert_mapping(struct address_space *mapping,
+			struct buffer_head *bh, void **entryp,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
+		.sector = to_sector(bh, mapping->host),
		.size = bh->b_size,
	};
-	int error;
+	void *ret;
+	void *entry = *entryp;

-	i_mmap_lock_read(mapping);
-
-	if (dax_map_atomic(bdev, &dax) < 0) {
-		error = PTR_ERR(dax.addr);
-		goto out;
-	}
+	if (dax_map_atomic(bdev, &dax) < 0)
+		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

-	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
-			vmf->flags & FAULT_FLAG_WRITE);
-	if (error)
-		goto out;
-
-	error = vm_insert_mixed(vma, vaddr, dax.pfn);
-
- out:
-	i_mmap_unlock_read(mapping);
-
-	return error;
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+	if (IS_ERR(ret))
+		return PTR_ERR(ret);
+	*entryp = ret;
+
+	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
@@ -547,7 +829,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
-	struct page *page;
+	void *entry;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
@@ -556,6 +838,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	int error;
	int major = 0;

+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is supposed
+	 * to hold locks serializing us with truncate / punch hole so this is
+	 * a reliable test.
+	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;
@@ -565,40 +852,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

- repeat:
-	page = find_get_page(mapping, vmf->pgoff);
-	if (page) {
-		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
-			put_page(page);
-			return VM_FAULT_RETRY;
-		}
-		if (unlikely(page->mapping != mapping)) {
-			unlock_page(page);
-			put_page(page);
-			goto repeat;
-		}
+	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		error = PTR_ERR(entry);
+		goto out;
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
-		goto unlock_page;
-
-	if (!buffer_mapped(&bh) && !vmf->cow_page) {
-		if (vmf->flags & FAULT_FLAG_WRITE) {
-			error = get_block(inode, block, &bh, 1);
-			count_vm_event(PGMAJFAULT);
-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
-			if (!error && (bh.b_size < PAGE_SIZE))
-				error = -EIO;
-			if (error)
-				goto unlock_page;
-		} else {
-			return dax_load_hole(mapping, page, vmf);
-		}
-	}
+		goto unlock_entry;

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
@@ -607,30 +871,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
-			goto unlock_page;
-		vmf->page = page;
-		if (!page)
-			i_mmap_lock_read(mapping);
-		return VM_FAULT_LOCKED;
-	}
+			goto unlock_entry;
+		if (!radix_tree_exceptional_entry(entry)) {
+			vmf->page = entry;
+			return VM_FAULT_LOCKED;
+		}
+		vmf->entry = entry;
+		return VM_FAULT_DAX_LOCKED;
+	}

-	/* Check we didn't race with a read fault installing a new page */
-	if (!page && major)
-		page = find_lock_page(mapping, vmf->pgoff);
-
-	if (page) {
-		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-							PAGE_SIZE, 0);
-		delete_from_page_cache(page);
-		unlock_page(page);
-		put_page(page);
-		page = NULL;
+	if (!buffer_mapped(&bh)) {
+		if (vmf->flags & FAULT_FLAG_WRITE) {
+			error = get_block(inode, block, &bh, 1);
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+			if (!error && (bh.b_size < PAGE_SIZE))
+				error = -EIO;
+			if (error)
+				goto unlock_entry;
+		} else {
+			return dax_load_hole(mapping, entry, vmf);
+		}
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(inode, &bh, vma, vmf);
+	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);

+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
@@ -638,13 +907,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;
-
- unlock_page:
-	if (page) {
-		unlock_page(page);
-		put_page(page);
-	}
-	goto out;
}
EXPORT_SYMBOL(__dax_fault);

@@ -675,7 +937,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
EXPORT_SYMBOL_GPL(dax_fault);

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
@@ -713,7 +975,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
-	int error, result = 0;
+	int result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
@@ -786,9 +1048,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		truncate_pagecache_range(inode, lstart, lend);
	}

-	i_mmap_lock_read(mapping);
-
-	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+	if (!write && !buffer_mapped(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();
@@ -860,13 +1120,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		 * the write to insert a dirty entry.
		 */
		if (write) {
-			error = dax_radix_entry(mapping, pgoff, dax.sector,
-					true, true);
-			if (error) {
-				dax_pmd_dbg(&bh, address,
-						"PMD radix insertion failed");
-				goto fallback;
-			}
+			/*
+			 * We should insert radix-tree entry and dirty it here.
+			 * For now this is broken...
+			 */
		}

		dev_dbg(part_to_dev(bdev->bd_part),
@@ -879,8 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
	}

 out:
-	i_mmap_unlock_read(mapping);
-
	return result;

 fallback:
@@ -926,23 +1181,18 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
-	int error;
-
-	/*
-	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
-	 * RADIX_DAX_PTE entry already exists in the radix tree from a
-	 * previous call to __dax_fault().  We just want to look up that PTE
-	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
-	 * saves us from having to make a call to get_block() here to look
-	 * up the sector.
-	 */
-	error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
-			true);
+	struct address_space *mapping = file->f_mapping;
+	void *entry;
+	pgoff_t index = vmf->pgoff;

-	if (error == -ENOMEM)
-		return VM_FAULT_OOM;
-	if (error)
-		return VM_FAULT_SIGBUS;
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!entry || !radix_tree_exceptional_entry(entry))
+		goto out;
+	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+	put_unlocked_mapping_entry(mapping, index, entry);
+out:
+	spin_unlock_irq(&mapping->tree_lock);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
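
The locking helpers added above never embed a wait queue in the entry itself: contending tasks hash the (mapping, index) pair into a 4096-entry wait table and re-check the lock bit under mapping->tree_lock after every wakeup. The fragment below is a rough userspace analogue of that protocol, offered only as a sketch: a pthread mutex stands in for the tree lock, condition variables stand in for the wait table, and every name in it is invented for illustration.

/* Userspace analogue (a sketch, not the kernel implementation) of the DAX
 * entry lock: a bit in the stored value is the lock, a hashed table of
 * condition variables replaces the wait_queue_head_t table, and one mutex
 * plays the role of mapping->tree_lock. Compile with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>

#define ENTRY_LOCK      1UL			/* stand-in for RADIX_DAX_ENTRY_LOCK */
#define WAIT_TABLE_BITS 12			/* same size the patch picks: 4096 */
#define WAIT_TABLE_SIZE (1UL << WAIT_TABLE_BITS)
#define NENTRIES        64

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;	/* mapping->tree_lock */
static pthread_cond_t wait_table[WAIT_TABLE_SIZE];		/* dax_entry_waitqueue() */
static unsigned long entries[NENTRIES];				/* the "radix tree" slots */

static pthread_cond_t *entry_waitqueue(unsigned long index)
{
	/* crude stand-in for hash_long((unsigned long)mapping ^ index, ...) */
	return &wait_table[index & (WAIT_TABLE_SIZE - 1)];
}

/* Like grab_mapping_entry() for an existing entry: sleep until the lock bit
 * is clear, then set it, all while holding the tree lock when we look. */
static void lock_entry(unsigned long index)
{
	pthread_mutex_lock(&tree_lock);
	while (entries[index] & ENTRY_LOCK)
		pthread_cond_wait(entry_waitqueue(index), &tree_lock);
	entries[index] |= ENTRY_LOCK;
	pthread_mutex_unlock(&tree_lock);
}

/* Like dax_unlock_mapping_entry(): clear the bit, then wake one waiter. */
static void unlock_entry(unsigned long index)
{
	pthread_mutex_lock(&tree_lock);
	entries[index] &= ~ENTRY_LOCK;
	pthread_mutex_unlock(&tree_lock);
	pthread_cond_signal(entry_waitqueue(index));
}

int main(void)
{
	for (unsigned long i = 0; i < WAIT_TABLE_SIZE; i++)
		pthread_cond_init(&wait_table[i], NULL);	/* init_dax_wait_table() */

	lock_entry(7);		/* fault handler takes the entry lock ...     */
	unlock_entry(7);	/* ... and drops it once the PTE is installed */
	printf("entry 7 lock bit after unlock: %lu\n", entries[7] & ENTRY_LOCK);
	return 0;
}

As in dax_unlock_mapping_entry(), the wakeup is issued after the lock is dropped, and the waiter's while loop re-tests the bit under the lock, so an early or spurious wakeup is harmless.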
+15 −1
@@ -3,17 +3,25 @@

#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/radix-tree.h>
#include <asm/pgtable.h>

+/* We use lowest available exceptional entry bit for locking */
+#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+
ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
		  get_block_t, dio_iodone_t, int flags);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+				   pgoff_t index, bool wake_all);

#ifdef CONFIG_FS_DAX
struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length);
#else
@@ -22,6 +30,12 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
{
	return ERR_PTR(-ENXIO);
}
+/* Shouldn't ever be called when dax is disabled. */
+static inline void dax_unlock_mapping_entry(struct address_space *mapping,
+					    pgoff_t index)
+{
+	BUG();
+}
static inline int __dax_zero_page_range(struct block_device *bdev,
		sector_t sector, unsigned int offset, unsigned int length)
{
@@ -29,7 +43,7 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
}
#endif

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
				unsigned int flags, get_block_t);
int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+7 −0
@@ -303,6 +303,12 @@ struct vm_fault {
					 * is set (which is also implied by
					 * VM_FAULT_ERROR).
					 */
+	void *entry;			/* ->fault handler can alternatively
+					 * return locked DAX entry. In that
+					 * case handler should return
+					 * VM_FAULT_DAX_LOCKED and fill in
+					 * entry here.
+					 */
	/* for ->map_pages() only */
	pgoff_t max_pgoff;		/* map pages for offset from pgoff till
					 * max_pgoff inclusive */
@@ -1076,6 +1082,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
#define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
#define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
+#define VM_FAULT_DAX_LOCKED 0x1000	/* ->fault has locked DAX entry */

#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
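
The new vmf->entry field and VM_FAULT_DAX_LOCKED bit let a ->fault handler return a locked exceptional entry where it previously could only return a locked page. The toy program below sketches that contract under stated assumptions: every type and helper in it is a stand-in written for this example; only the field name, the flag values and the idea of pairing dax_unlock_mapping_entry() with VM_FAULT_DAX_LOCKED (versus unlock_page() with VM_FAULT_LOCKED) come from the patches.

/* Toy model of the VM_FAULT_DAX_LOCKED contract; not kernel code.
 * struct vm_fault here is a cut-down stand-in, and toy_dax_fault()/
 * toy_finish_fault() are invented for this sketch. */
#include <stdio.h>

#define VM_FAULT_LOCKED		0x0200	/* handler returned a locked page */
#define VM_FAULT_DAX_LOCKED	0x1000	/* handler returned a locked DAX entry */

struct vm_fault {
	unsigned long pgoff;
	void *page;		/* valid when VM_FAULT_LOCKED is set */
	void *entry;		/* valid when VM_FAULT_DAX_LOCKED is set */
};

/* A DAX file has no struct page for the faulting index, so the handler
 * hands back the locked radix tree entry instead of a locked page. */
static int toy_dax_fault(struct vm_fault *vmf)
{
	static unsigned long locked_entry = 0x6;	/* exceptional + lock bit */

	vmf->entry = &locked_entry;
	return VM_FAULT_DAX_LOCKED;
}

/* The completion path must release whichever object it was given. */
static void toy_finish_fault(struct vm_fault *vmf, int ret)
{
	if (ret & VM_FAULT_DAX_LOCKED)
		printf("dax_unlock_mapping_entry(mapping, %lu)\n", vmf->pgoff);
	else if (ret & VM_FAULT_LOCKED)
		printf("unlock_page(vmf->page) for pgoff %lu\n", vmf->pgoff);
}

int main(void)
{
	struct vm_fault vmf = { .pgoff = 7 };

	toy_finish_fault(&vmf, toy_dax_fault(&vmf));
	return 0;
}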


+21 −9
@@ -143,13 +143,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
			return;

	/*
-	 * Track node that only contains shadow entries.
+	 * Track node that only contains shadow entries. DAX mappings contain
+	 * no shadow entries and may contain other exceptional entries so skip
+	 * those.
	 *
	 * Avoid acquiring the list_lru lock if already tracked.  The
	 * list_empty() test is safe as node->private_list is
	 * protected by mapping->tree_lock.
	 */
-	if (!workingset_node_pages(node) &&
+	if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
	    list_empty(&node->private_list)) {
		node->private_data = mapping;
		list_lru_add(&workingset_shadow_nodes, &node->private_list);
@@ -580,14 +582,24 @@ static int page_cache_tree_insert(struct address_space *mapping,
		if (!radix_tree_exceptional_entry(p))
			return -EEXIST;

-		if (WARN_ON(dax_mapping(mapping)))
-			return -EINVAL;
-
-		if (shadowp)
-			*shadowp = p;
-		mapping->nrexceptional--;
-		if (node)
-			workingset_node_shadows_dec(node);
+		mapping->nrexceptional--;
+		if (!dax_mapping(mapping)) {
+			if (shadowp)
+				*shadowp = p;
+			if (node)
+				workingset_node_shadows_dec(node);
+		} else {
+			/* DAX can replace empty locked entry with a hole */
+			WARN_ON_ONCE(p !=
+				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+					 RADIX_DAX_ENTRY_LOCK));
+			/* DAX accounts exceptional entries as normal pages */
+			if (node)
+				workingset_node_pages_dec(node);
+			/* Wakeup waiters for exceptional entry lock */
+			dax_wake_mapping_entry_waiter(mapping, page->index,
+						      false);
+		}
	}
	radix_tree_replace_slot(slot, page);
	mapping->nrpages++;