Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a2d58167 authored by Dave Jiang, committed by Linus Torvalds
Browse files

mm,fs,dax: change ->pmd_fault to ->huge_fault

Patch series "1G transparent hugepage support for device dax", v2.

The following series implements support for 1G transparent hugepage on
x86 for device dax.  The bulk of the code was written by Matthew Wilcox a
while back supporting transparent 1G hugepage for fs DAX.  I have
forward ported the relevant bits to 4.10-rc.  The current submission has
only the necessary code to support device DAX.

Comments from Dan Williams: So the motivation and intended user of this
functionality mirrors the motivation and users of 1GB page support in
hugetlbfs.  Given expected capacities of persistent memory devices an
in-memory database may want to reduce tlb pressure beyond what they can
already achieve with 2MB mappings of a device-dax file.  We have
customer feedback to that effect as Willy mentioned in his previous
version of these patches [1].

[1]: https://lkml.org/lkml/2016/1/31/52

Comments from Nilesh @ Oracle:

There are applications which have a process model; and if you assume
10,000 processes attempting to mmap all the 6TB memory available on a
server; we are looking at the following:

processes         : 10,000
memory            :    6TB
pte @ 4k page size: 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB
pmd @ 2M page size: 120,000 / 512 = ~240GB
pud @ 1G page size: 240GB / 512 = ~480MB

As you can see, with 2M pages this system will use up an exorbitant
amount of DRAM to hold the page tables, but 1G pages finally bring
it down to a reasonable level.  Memory sizes will keep increasing, so
this number will keep increasing.

An argument can be made to convert the applications from process model
to thread model, but in the real world that may not be always practical.
Hopefully this helps explain the use case where this is valuable.

This patch (of 3):

In preparation for adding the ability to handle PUD pages, convert
vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault.  The
vm_fault structure is extended to include a union of the different page
table pointers that may be needed, and three flag bits are reserved to
indicate which type of pointer is in the union.

[ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()]
  Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com
[dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path]
  Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com


Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent bd233f53
Loading
Loading
Loading
Loading
+13 −21
Original line number Original line Diff line number Diff line
@@ -419,7 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
	return -1;
	return -1;
}
}


static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
{
{
	struct device *dev = &dax_dev->dev;
	struct device *dev = &dax_dev->dev;
	struct dax_region *dax_region;
	struct dax_region *dax_region;
@@ -455,23 +455,6 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
	return VM_FAULT_NOPAGE;
	return VM_FAULT_NOPAGE;
}
}


/*
 * Fault entry point for a device-dax mapping.
 *
 * Emits a debug message naming the current task, whether the access is a
 * read or a write, and the bounds of the faulting VMA, then hands the fault
 * to __dax_dev_fault() inside an RCU read-side critical section and returns
 * whatever that helper returned.
 */
static int dax_dev_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct dax_dev *dax_dev = vma->vm_file->private_data;
	const char *access = (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read";
	int ret;

	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
		current->comm, access, vma->vm_start, vma->vm_end);

	rcu_read_lock();
	ret = __dax_dev_fault(dax_dev, vmf);
	rcu_read_unlock();

	return ret;
}

static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
{
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -510,7 +493,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
			vmf->flags & FAULT_FLAG_WRITE);
			vmf->flags & FAULT_FLAG_WRITE);
}
}


static int dax_dev_pmd_fault(struct vm_fault *vmf)
static int dax_dev_fault(struct vm_fault *vmf)
{
{
	int rc;
	int rc;
	struct file *filp = vmf->vma->vm_file;
	struct file *filp = vmf->vma->vm_file;
@@ -522,7 +505,16 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf)
			vmf->vma->vm_start, vmf->vma->vm_end);
			vmf->vma->vm_start, vmf->vma->vm_end);


	rcu_read_lock();
	rcu_read_lock();
	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
	case FAULT_FLAG_SIZE_PTE:
		rc = __dax_dev_pte_fault(dax_dev, vmf);
		break;
	case FAULT_FLAG_SIZE_PMD:
		rc = __dax_dev_pmd_fault(dax_dev, vmf);
		rc = __dax_dev_pmd_fault(dax_dev, vmf);
		break;
	default:
		return VM_FAULT_FALLBACK;
	}
	rcu_read_unlock();
	rcu_read_unlock();


	return rc;
	return rc;
@@ -530,7 +522,7 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf)


static const struct vm_operations_struct dax_dev_vm_ops = {
static const struct vm_operations_struct dax_dev_vm_ops = {
	.fault = dax_dev_fault,
	.fault = dax_dev_fault,
	.pmd_fault = dax_dev_pmd_fault,
	.huge_fault = dax_dev_fault,
};
};


static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
+32 −13
Original line number Original line Diff line number Diff line
@@ -1118,16 +1118,8 @@ static int dax_fault_return(int error)
	return VM_FAULT_SIGBUS;
	return VM_FAULT_SIGBUS;
}
}


/**
static int dax_iomap_pte_fault(struct vm_fault *vmf,
 * dax_iomap_fault - handle a page fault on a DAX file
			       const struct iomap_ops *ops)
 * @vmf: The description of the fault
 * @ops: iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in their fault
 * or mkwrite handler for DAX files. Assumes the caller has done all the
 * necessary locking for the page fault to proceed successfully.
 */
int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
{
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	struct inode *inode = mapping->host;
@@ -1244,7 +1236,6 @@ int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
	}
	}
	return vmf_ret;
	return vmf_ret;
}
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);


#ifdef CONFIG_FS_DAX_PMD
#ifdef CONFIG_FS_DAX_PMD
/*
/*
@@ -1335,7 +1326,8 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
	return VM_FAULT_FALLBACK;
	return VM_FAULT_FALLBACK;
}
}


int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
static int dax_iomap_pmd_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
{
	struct vm_area_struct *vma = vmf->vma;
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct address_space *mapping = vma->vm_file->f_mapping;
@@ -1443,5 +1435,32 @@ int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
	return result;
	return result;
}
}
EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
#else
/*
 * Stub for builds without CONFIG_FS_DAX_PMD: PMD-sized faults are never
 * handled here, so always tell the caller to fall back.
 *
 * @ops is const-qualified to match the CONFIG_FS_DAX_PMD implementation and
 * the dax_iomap_fault() caller, which passes a const struct iomap_ops
 * pointer; without the qualifier that call discards const.
 */
static int dax_iomap_pmd_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @ops: iomap ops passed from the file system
 *
 * Filesystems may call this helper from their fault handler for DAX files.
 * The size bits in @vmf->flags select the handler: PTE-sized faults go to
 * dax_iomap_pte_fault(), PMD-sized faults go to dax_iomap_pmd_fault(), and
 * any other size returns VM_FAULT_FALLBACK.  The caller is assumed to have
 * done all the locking needed for the page fault to proceed successfully.
 */
int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
{
	const unsigned int fault_size = vmf->flags & FAULT_FLAG_SIZE_MASK;

	if (fault_size == FAULT_FLAG_SIZE_PTE)
		return dax_iomap_pte_fault(vmf, ops);
	if (fault_size == FAULT_FLAG_SIZE_PMD)
		return dax_iomap_pmd_fault(vmf, ops);

	return VM_FAULT_FALLBACK;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
+1 −1
Original line number Original line Diff line number Diff line
@@ -133,7 +133,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
static const struct vm_operations_struct ext2_dax_vm_ops = {
static const struct vm_operations_struct ext2_dax_vm_ops = {
	.fault		= ext2_dax_fault,
	.fault		= ext2_dax_fault,
	/*
	/*
	 * .pmd_fault is not supported for DAX because allocation in ext2
	 * .huge_fault is not supported for DAX because allocation in ext2
	 * cannot be reliably aligned to huge page sizes and so pmd faults
	 * cannot be reliably aligned to huge page sizes and so pmd faults
	 * will always fail and fall back to regular faults.
	 * will always fail and fall back to regular faults.
	 */
	 */
+1 −22
Original line number Original line Diff line number Diff line
@@ -273,27 +273,6 @@ static int ext4_dax_fault(struct vm_fault *vmf)
	return result;
	return result;
}
}


/*
 * PMD-sized DAX fault handler for ext4.
 *
 * For write faults the work is bracketed by sb_start_pagefault() and
 * sb_end_pagefault(), and the file time is updated before the fault is
 * serviced.  The fault itself is delegated to dax_iomap_pmd_fault() with the
 * inode's i_mmap_sem held for read; its result is returned unchanged.
 */
static int ext4_dax_pmd_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	bool is_write = vmf->flags & FAULT_FLAG_WRITE;
	int ret;

	if (is_write) {
		sb_start_pagefault(sb);
		file_update_time(file);
	}

	down_read(&EXT4_I(inode)->i_mmap_sem);
	ret = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
	up_read(&EXT4_I(inode)->i_mmap_sem);

	if (is_write)
		sb_end_pagefault(sb);

	return ret;
}

/*
/*
 * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
 * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
 * handler we check for races against truncate. Note that since we cycle through
 * handler we check for races against truncate. Note that since we cycle through
@@ -326,7 +305,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)


static const struct vm_operations_struct ext4_dax_vm_ops = {
static const struct vm_operations_struct ext4_dax_vm_ops = {
	.fault		= ext4_dax_fault,
	.fault		= ext4_dax_fault,
	.pmd_fault	= ext4_dax_pmd_fault,
	.huge_fault	= ext4_dax_fault,
	.page_mkwrite	= ext4_dax_fault,
	.page_mkwrite	= ext4_dax_fault,
	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
};
};
+5 −5
Original line number Original line Diff line number Diff line
@@ -1429,12 +1429,12 @@ xfs_filemap_fault(
/*
/*
 * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
 * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
 * both read and write faults. Hence we need to handle both cases. There is no
 * both read and write faults. Hence we need to handle both cases. There is no
 * ->pmd_mkwrite callout for huge pages, so we have a single function here to
 * ->huge_mkwrite callout for huge pages, so we have a single function here to
 * handle both cases here. @flags carries the information on the type of fault
 * handle both cases here. @flags carries the information on the type of fault
 * occurring.
 * occurring.
 */
 */
STATIC int
STATIC int
xfs_filemap_pmd_fault(
xfs_filemap_huge_fault(
	struct vm_fault		*vmf)
	struct vm_fault		*vmf)
{
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct inode		*inode = file_inode(vmf->vma->vm_file);
@@ -1444,7 +1444,7 @@ xfs_filemap_pmd_fault(
	if (!IS_DAX(inode))
	if (!IS_DAX(inode))
		return VM_FAULT_FALLBACK;
		return VM_FAULT_FALLBACK;


	trace_xfs_filemap_pmd_fault(ip);
	trace_xfs_filemap_huge_fault(ip);


	if (vmf->flags & FAULT_FLAG_WRITE) {
	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		sb_start_pagefault(inode->i_sb);
@@ -1452,7 +1452,7 @@ xfs_filemap_pmd_fault(
	}
	}


	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
	ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);


	if (vmf->flags & FAULT_FLAG_WRITE)
	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -1497,7 +1497,7 @@ xfs_filemap_pfn_mkwrite(


static const struct vm_operations_struct xfs_file_vm_ops = {
static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.fault		= xfs_filemap_fault,
	.pmd_fault	= xfs_filemap_pmd_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
Loading