Loading fs/dax.c +240 −12 Original line number Diff line number Diff line Loading @@ -31,6 +31,8 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> #include <linux/iomap.h> #include "internal.h" /* * We use lowest available bit in exceptional entry for locking, other two Loading Loading @@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry, return VM_FAULT_LOCKED; } static int copy_user_bh(struct page *to, struct inode *inode, struct buffer_head *bh, unsigned long vaddr) static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, struct page *to, unsigned long vaddr) { struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, .sector = sector, .size = size, }; struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) Loading Loading @@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping, EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, struct buffer_head *bh, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) struct block_device *bdev, sector_t sector, size_t size, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { .sector = to_sector(bh, mapping->host), .size = bh->b_size, .sector = sector, .size = size, }; void *ret; void *entry = *entryp; Loading Loading @@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) error = copy_user_bh(new_page, inode, &bh, vaddr); error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), bh.b_size, new_page, vaddr); else clear_user_highpage(new_page, vaddr); if (error) Loading Loading @@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* Filesystem should not return unwritten buffers to us! */ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), bh.b_size, &entry, vma, vmf); unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: Loading Loading @@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) return dax_zero_page_range(inode, from, length, get_block); } EXPORT_SYMBOL_GPL(dax_truncate_page); #ifdef CONFIG_FS_IOMAP static loff_t iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; if (iov_iter_rw(iter) == READ) { end = min(end, i_size_read(inode)); if (pos >= end) return 0; if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) return iov_iter_zero(min(length, end - pos), iter); } if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) return -EIO; while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); struct blk_dax_ctl dax = { 0 }; ssize_t map_len; dax.sector = iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; map_len = dax_map_atomic(iomap->bdev, &dax); if (map_len < 0) { ret = map_len; break; } dax.addr += offset; map_len -= offset; if (map_len > end - pos) map_len = end - pos; if (iov_iter_rw(iter) == WRITE) map_len = copy_from_iter_pmem(dax.addr, map_len, iter); else map_len = copy_to_iter(dax.addr, map_len, iter); dax_unmap_atomic(iomap->bdev, &dax); if (map_len <= 0) { ret = map_len ? map_len : -EFAULT; break; } pos += map_len; length -= map_len; done += map_len; } return done ? done : ret; } /** * iomap_dax_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system * * This function performs read and write operations to directly mapped * persistent memory. The callers needs to take care of read/write exclusion * and evicting any page cache pages in the region under I/O. */ ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos, ret = 0, done = 0; unsigned flags = 0; if (iov_iter_rw(iter) == WRITE) flags |= IOMAP_WRITE; /* * Yes, even DAX files can have page cache attached to them: A zeroed * page is inserted into the pagecache when we have to serve a write * fault on a hole. It should never be dirtied and can simply be * dropped from the pagecache once we get real data for the page. * * XXX: This is racy against mmap, and there's nothing we can do about * it. We'll eventually need to shift this down even further so that * we can check if we allocated blocks over a hole first. */ if (mapping->nrpages) { ret = invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT); WARN_ON_ONCE(ret); } while (iov_iter_count(iter)) { ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, iter, iomap_dax_actor); if (ret <= 0) break; pos += ret; done += ret; } iocb->ki_pos += done; return done ? done : ret; } EXPORT_SYMBOL_GPL(iomap_dax_rw); /** * iomap_dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @ops: iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in their fault * or mkwrite handler for DAX files. Assumes the caller has done all the * necessary locking for the page fault to proceed successfully. */ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = (unsigned long)vmf->virtual_address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; unsigned flags = 0; int error, major = 0; void *entry; /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ if (pos >= i_size_read(inode)) return VM_FAULT_SIGBUS; entry = grab_mapping_entry(mapping, vmf->pgoff); if (IS_ERR(entry)) { error = PTR_ERR(entry); goto out; } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; /* * Note that we don't bother to use iomap_apply here: DAX required * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); if (error) goto unlock_entry; if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { error = -EIO; /* fs corruption? */ goto unlock_entry; } sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); if (vmf->cow_page) { switch (iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: clear_user_highpage(vmf->cow_page, vaddr); break; case IOMAP_MAPPED: error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, vmf->cow_page, vaddr); break; default: WARN_ON_ONCE(1); error = -EIO; break; } if (error) goto unlock_entry; if (!radix_tree_exceptional_entry(entry)) { vmf->page = entry; return VM_FAULT_LOCKED; } vmf->entry = entry; return VM_FAULT_DAX_LOCKED; } switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, sector, PAGE_SIZE, &entry, vma, vmf); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) return dax_load_hole(mapping, entry, vmf); /*FALLTHRU*/ default: WARN_ON_ONCE(1); error = -EIO; break; } unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error < 0 && error != -EBUSY) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; } EXPORT_SYMBOL_GPL(iomap_dax_fault); #endif /* CONFIG_FS_IOMAP */ fs/ext2/Kconfig +1 −0 Original line number Diff line number Diff line config EXT2_FS tristate "Second extended fs support" select FS_IOMAP if FS_DAX help Ext2 is a standard Linux file system for hard disks. Loading fs/ext2/ext2.h +1 −0 Original line number Diff line number Diff line Loading @@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; extern struct iomap_ops ext2_iomap_ops; /* namei.c */ extern const struct inode_operations ext2_dir_inode_operations; Loading fs/ext2/file.c +69 −7 Original line number Diff line number Diff line Loading @@ -22,11 +22,59 @@ #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/iomap.h> #include <linux/uio.h> #include "ext2.h" #include "xattr.h" #include "acl.h" #ifdef CONFIG_FS_DAX static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = iocb->ki_filp->f_mapping->host; ssize_t ret; if (!iov_iter_count(to)) return 0; /* skip atime */ inode_lock_shared(inode); ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto out_unlock; ret = file_remove_privs(file); if (ret) goto out_unlock; ret = file_update_time(file); if (ret) goto out_unlock; ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); mark_inode_dirty(inode); } out_unlock: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } /* * The lock ordering for ext2 DAX fault paths is: * Loading @@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); ret = dax_fault(vma, vmf, ext2_get_block); ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) Loading Loading @@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; } /* * We have mostly NULL's here: the current defaults are ok for * the ext2 filesystem. */ static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { #ifdef CONFIG_FS_DAX if (IS_DAX(iocb->ki_filp->f_mapping->host)) return ext2_dax_read_iter(iocb, to); #endif return generic_file_read_iter(iocb, to); } static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { #ifdef CONFIG_FS_DAX if (IS_DAX(iocb->ki_filp->f_mapping->host)) return ext2_dax_write_iter(iocb, from); #endif return generic_file_write_iter(iocb, from); } const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .read_iter = ext2_file_read_iter, .write_iter = ext2_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, Loading fs/ext2/inode.c +81 −19 Original line number Diff line number Diff line Loading @@ -32,6 +32,7 @@ #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> #include "ext2.h" Loading Loading @@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode, */ static int ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, u32 *bno, bool *new, bool *boundary, int create) { int err = -EIO; Loading @@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); clear_buffer_new(bh_result); /* What's this do? */ count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { Loading Loading @@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; clear_buffer_new(bh_result); goto got_it; } } Loading Loading @@ -745,15 +744,16 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } } else set_buffer_new(bh_result); } else { *new = true; } ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); *bno = le32_to_cpu(chain[depth-1].key); if (count > blocks_to_boundary) set_buffer_boundary(bh_result); *boundary = true; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ Loading @@ -765,19 +765,82 @@ static int ext2_get_blocks(struct inode *inode, return err; } int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int ret = ext2_get_blocks(inode, iblock, max_blocks, bh_result, create); if (ret > 0) { bool new = false, boundary = false; u32 bno; int ret; ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary, create); if (ret <= 0) return ret; map_bh(bh_result, inode->i_sb, bno); bh_result->b_size = (ret << inode->i_blkbits); ret = 0; if (new) set_buffer_new(bh_result); if (boundary) set_buffer_boundary(bh_result); return 0; } #ifdef CONFIG_FS_DAX static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; bool new = false, boundary = false; u32 bno; int ret; ret = ext2_get_blocks(inode, first_block, max_blocks, &bno, &new, &boundary, flags & IOMAP_WRITE); if (ret < 0) return ret; iomap->flags = 0; iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64)first_block << blkbits; if (ret == 0) { iomap->type = IOMAP_HOLE; iomap->blkno = IOMAP_NULL_BLOCK; iomap->length = 1 << blkbits; } else { iomap->type = IOMAP_MAPPED; iomap->blkno = (sector_t)bno << (blkbits - 9); iomap->length = (u64)ret << blkbits; iomap->flags |= IOMAP_F_MERGED; } if (new) iomap->flags |= IOMAP_F_NEW; return 0; } static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) ext2_write_failed(inode->i_mapping, offset + length); return 0; } struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; #endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { Loading Loading @@ -863,10 +926,9 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; if (IS_DAX(inode)) ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL, DIO_LOCKING); else if (WARN_ON_ONCE(IS_DAX(inode))) return -EIO; ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); Loading Loading
fs/dax.c +240 −12 Original line number Diff line number Diff line Loading @@ -31,6 +31,8 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> #include <linux/iomap.h> #include "internal.h" /* * We use lowest available bit in exceptional entry for locking, other two Loading Loading @@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry, return VM_FAULT_LOCKED; } static int copy_user_bh(struct page *to, struct inode *inode, struct buffer_head *bh, unsigned long vaddr) static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, struct page *to, unsigned long vaddr) { struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, .sector = sector, .size = size, }; struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) Loading Loading @@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping, EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, struct buffer_head *bh, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) struct block_device *bdev, sector_t sector, size_t size, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { .sector = to_sector(bh, mapping->host), .size = bh->b_size, .sector = sector, .size = size, }; void *ret; void *entry = *entryp; Loading Loading @@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) error = copy_user_bh(new_page, inode, &bh, vaddr); error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), bh.b_size, new_page, vaddr); else clear_user_highpage(new_page, vaddr); if (error) Loading Loading @@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* Filesystem should not return unwritten buffers to us! */ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), bh.b_size, &entry, vma, vmf); unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: Loading Loading @@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) return dax_zero_page_range(inode, from, length, get_block); } EXPORT_SYMBOL_GPL(dax_truncate_page); #ifdef CONFIG_FS_IOMAP static loff_t iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; if (iov_iter_rw(iter) == READ) { end = min(end, i_size_read(inode)); if (pos >= end) return 0; if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) return iov_iter_zero(min(length, end - pos), iter); } if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) return -EIO; while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); struct blk_dax_ctl dax = { 0 }; ssize_t map_len; dax.sector = iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; map_len = dax_map_atomic(iomap->bdev, &dax); if (map_len < 0) { ret = map_len; break; } dax.addr += offset; map_len -= offset; if (map_len > end - pos) map_len = end - pos; if (iov_iter_rw(iter) == WRITE) map_len = copy_from_iter_pmem(dax.addr, map_len, iter); else map_len = copy_to_iter(dax.addr, map_len, iter); dax_unmap_atomic(iomap->bdev, &dax); if (map_len <= 0) { ret = map_len ? map_len : -EFAULT; break; } pos += map_len; length -= map_len; done += map_len; } return done ? done : ret; } /** * iomap_dax_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system * * This function performs read and write operations to directly mapped * persistent memory. The callers needs to take care of read/write exclusion * and evicting any page cache pages in the region under I/O. */ ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos, ret = 0, done = 0; unsigned flags = 0; if (iov_iter_rw(iter) == WRITE) flags |= IOMAP_WRITE; /* * Yes, even DAX files can have page cache attached to them: A zeroed * page is inserted into the pagecache when we have to serve a write * fault on a hole. It should never be dirtied and can simply be * dropped from the pagecache once we get real data for the page. * * XXX: This is racy against mmap, and there's nothing we can do about * it. We'll eventually need to shift this down even further so that * we can check if we allocated blocks over a hole first. */ if (mapping->nrpages) { ret = invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT); WARN_ON_ONCE(ret); } while (iov_iter_count(iter)) { ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, iter, iomap_dax_actor); if (ret <= 0) break; pos += ret; done += ret; } iocb->ki_pos += done; return done ? done : ret; } EXPORT_SYMBOL_GPL(iomap_dax_rw); /** * iomap_dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @ops: iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in their fault * or mkwrite handler for DAX files. Assumes the caller has done all the * necessary locking for the page fault to proceed successfully. */ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = (unsigned long)vmf->virtual_address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; unsigned flags = 0; int error, major = 0; void *entry; /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ if (pos >= i_size_read(inode)) return VM_FAULT_SIGBUS; entry = grab_mapping_entry(mapping, vmf->pgoff); if (IS_ERR(entry)) { error = PTR_ERR(entry); goto out; } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; /* * Note that we don't bother to use iomap_apply here: DAX required * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); if (error) goto unlock_entry; if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { error = -EIO; /* fs corruption? */ goto unlock_entry; } sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); if (vmf->cow_page) { switch (iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: clear_user_highpage(vmf->cow_page, vaddr); break; case IOMAP_MAPPED: error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, vmf->cow_page, vaddr); break; default: WARN_ON_ONCE(1); error = -EIO; break; } if (error) goto unlock_entry; if (!radix_tree_exceptional_entry(entry)) { vmf->page = entry; return VM_FAULT_LOCKED; } vmf->entry = entry; return VM_FAULT_DAX_LOCKED; } switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, sector, PAGE_SIZE, &entry, vma, vmf); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) return dax_load_hole(mapping, entry, vmf); /*FALLTHRU*/ default: WARN_ON_ONCE(1); error = -EIO; break; } unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error < 0 && error != -EBUSY) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; } EXPORT_SYMBOL_GPL(iomap_dax_fault); #endif /* CONFIG_FS_IOMAP */
fs/ext2/Kconfig +1 −0 Original line number Diff line number Diff line config EXT2_FS tristate "Second extended fs support" select FS_IOMAP if FS_DAX help Ext2 is a standard Linux file system for hard disks. Loading
fs/ext2/ext2.h +1 −0 Original line number Diff line number Diff line Loading @@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; extern struct iomap_ops ext2_iomap_ops; /* namei.c */ extern const struct inode_operations ext2_dir_inode_operations; Loading
fs/ext2/file.c +69 −7 Original line number Diff line number Diff line Loading @@ -22,11 +22,59 @@ #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/iomap.h> #include <linux/uio.h> #include "ext2.h" #include "xattr.h" #include "acl.h" #ifdef CONFIG_FS_DAX static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = iocb->ki_filp->f_mapping->host; ssize_t ret; if (!iov_iter_count(to)) return 0; /* skip atime */ inode_lock_shared(inode); ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto out_unlock; ret = file_remove_privs(file); if (ret) goto out_unlock; ret = file_update_time(file); if (ret) goto out_unlock; ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); mark_inode_dirty(inode); } out_unlock: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } /* * The lock ordering for ext2 DAX fault paths is: * Loading @@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); ret = dax_fault(vma, vmf, ext2_get_block); ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) Loading Loading @@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; } /* * We have mostly NULL's here: the current defaults are ok for * the ext2 filesystem. */ static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { #ifdef CONFIG_FS_DAX if (IS_DAX(iocb->ki_filp->f_mapping->host)) return ext2_dax_read_iter(iocb, to); #endif return generic_file_read_iter(iocb, to); } static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { #ifdef CONFIG_FS_DAX if (IS_DAX(iocb->ki_filp->f_mapping->host)) return ext2_dax_write_iter(iocb, from); #endif return generic_file_write_iter(iocb, from); } const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .read_iter = ext2_file_read_iter, .write_iter = ext2_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, Loading
fs/ext2/inode.c +81 −19 Original line number Diff line number Diff line Loading @@ -32,6 +32,7 @@ #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> #include "ext2.h" Loading Loading @@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode, */ static int ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, u32 *bno, bool *new, bool *boundary, int create) { int err = -EIO; Loading @@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); clear_buffer_new(bh_result); /* What's this do? */ count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { Loading Loading @@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; clear_buffer_new(bh_result); goto got_it; } } Loading Loading @@ -745,15 +744,16 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } } else set_buffer_new(bh_result); } else { *new = true; } ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); *bno = le32_to_cpu(chain[depth-1].key); if (count > blocks_to_boundary) set_buffer_boundary(bh_result); *boundary = true; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ Loading @@ -765,19 +765,82 @@ static int ext2_get_blocks(struct inode *inode, return err; } int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int ret = ext2_get_blocks(inode, iblock, max_blocks, bh_result, create); if (ret > 0) { bool new = false, boundary = false; u32 bno; int ret; ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary, create); if (ret <= 0) return ret; map_bh(bh_result, inode->i_sb, bno); bh_result->b_size = (ret << inode->i_blkbits); ret = 0; if (new) set_buffer_new(bh_result); if (boundary) set_buffer_boundary(bh_result); return 0; } #ifdef CONFIG_FS_DAX static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; bool new = false, boundary = false; u32 bno; int ret; ret = ext2_get_blocks(inode, first_block, max_blocks, &bno, &new, &boundary, flags & IOMAP_WRITE); if (ret < 0) return ret; iomap->flags = 0; iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64)first_block << blkbits; if (ret == 0) { iomap->type = IOMAP_HOLE; iomap->blkno = IOMAP_NULL_BLOCK; iomap->length = 1 << blkbits; } else { iomap->type = IOMAP_MAPPED; iomap->blkno = (sector_t)bno << (blkbits - 9); iomap->length = (u64)ret << blkbits; iomap->flags |= IOMAP_F_MERGED; } if (new) iomap->flags |= IOMAP_F_NEW; return 0; } static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) ext2_write_failed(inode->i_mapping, offset + length); return 0; } struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; #endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { Loading Loading @@ -863,10 +926,9 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; if (IS_DAX(inode)) ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL, DIO_LOCKING); else if (WARN_ON_ONCE(IS_DAX(inode))) return -EIO; ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); Loading