
Commit afddba49 authored by Nick Piggin, committed by Linus Torvalds

fs: introduce write_begin, write_end, and perform_write aops



These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).

[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 637aff46
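
As an illustration of the new calling convention (a simplified sketch, not the literal mm/filemap.c code from this series; the iov_iter helpers used here, such as iov_iter_copy_from_user_atomic, are assumed from the accompanying patches), the generic buffered write path can now fault the source pages in before any pagecache page is locked, and treat a short copy as a retry rather than a deadlock:

	static ssize_t sketch_perform_write(struct file *file,
			struct iov_iter *i, loff_t pos)
	{
		struct address_space *mapping = file->f_mapping;
		const struct address_space_operations *a_ops = mapping->a_ops;
		ssize_t written = 0;
		int status = 0;

		do {
			struct page *page;
			void *fsdata;
			unsigned long offset = pos & (PAGE_CACHE_SIZE - 1);
			unsigned long bytes = min_t(unsigned long,
					PAGE_CACHE_SIZE - offset, iov_iter_count(i));
			size_t copied;

			/*
			 * Fault the source page in before ->write_begin takes
			 * the page lock; this ordering is what removes the old
			 * prepare_write deadlock.
			 */
			if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
				status = -EFAULT;
				break;
			}

			status = a_ops->write_begin(file, mapping, pos, bytes, 0,
							&page, &fsdata);
			if (unlikely(status))
				break;

			/* the atomic copy may be short if the source gets paged out */
			pagefault_disable();
			copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
			pagefault_enable();
			flush_dcache_page(page);

			status = a_ops->write_end(file, mapping, pos, bytes, copied,
							page, fsdata);
			if (unlikely(status < 0))
				break;
			copied = status;

			/* a short (even zero-length) commit simply means "try again" */
			iov_iter_advance(i, copied);
			pos += copied;
			written += copied;
		} while (iov_iter_count(i));

		return written ? written : status;
	}
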
Documentation/filesystems/Locking: +6 −3
@@ -178,15 +178,18 @@ prototypes:
locking rules:
	All except set_page_dirty may block

			BKL	PageLocked(page)
			BKL	PageLocked(page)	i_sem
writepage:		no	yes, unlocks (see below)
readpage:		no	yes, unlocks
sync_page:		no	maybe
writepages:		no
set_page_dirty		no	no
readpages:		no
prepare_write:		no	yes
commit_write:		no	yes
prepare_write:		no	yes			yes
commit_write:		no	yes			yes
write_begin:		no	locks the page		yes
write_end:		no	yes, unlocks		yes
perform_write:		no	n/a			yes
bmap:			yes
invalidatepage:		no	yes
releasepage:		no	yes
Documentation/filesystems/vfs.txt: +45 −0
@@ -537,6 +537,12 @@ struct address_space_operations {
			struct list_head *pages, unsigned nr_pages);
	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
	int (*write_begin)(struct file *, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata);
	int (*write_end)(struct file *, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata);
	sector_t (*bmap)(struct address_space *, sector_t);
	int (*invalidatepage) (struct page *, unsigned long);
	int (*releasepage) (struct page *, int);
@@ -633,6 +639,45 @@ struct address_space_operations {
        operations.  It should avoid returning an error if possible -
        errors should have been handled by prepare_write.

  write_begin: This is intended as a replacement for prepare_write. The
	key differences are that:
		- it returns a locked page (in *pagep) rather than being
		  given a pre locked page;
		- it must be able to cope with short writes (where the
		  length passed to write_begin is greater than the number
		  of bytes copied into the page).

	Called by the generic buffered write code to ask the filesystem to
	prepare to write len bytes at the given offset in the file. The
	address_space should check that the write will be able to complete,
	by allocating space if necessary and doing any other internal
	housekeeping.  If the write will update parts of any basic-blocks on
	storage, then those blocks should be pre-read (if they haven't been
	read already) so that the updated blocks can be written out properly.

        The filesystem must return the locked pagecache page for the specified
	offset, in *pagep, for the caller to write into.

	flags is a field for AOP_FLAG_xxx flags, described in
	include/linux/fs.h.

        A void * may be returned in fsdata, which then gets passed into
        write_end.

        Returns 0 on success; < 0 on failure (which is the error code), in
	which case write_end is not called.

  write_end: After a successful write_begin, and data copy, write_end must
        be called. len is the original len passed to write_begin, and copied
        is the amount that was able to be copied (copied == len is always true
	if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag).

        The filesystem must take care of unlocking the page, releasing its
        refcount, and updating i_size.

        Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
        that were able to be copied into pagecache.
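
	As a rough illustration of these two contracts (a sketch for
	illustration only, not part of the patch text; all names are
	hypothetical), the simplest possible pair for a filesystem that keeps
	its data purely in pagecache could look like the following, which
	closely mirrors the simple_write_begin/simple_write_end helpers this
	patch adds to fs/libfs.c:

		static int example_write_begin(struct file *file,
				struct address_space *mapping, loff_t pos,
				unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
		{
			struct page *page;
			pgoff_t index = pos >> PAGE_CACHE_SHIFT;

			/* find or create the pagecache page; it is returned locked */
			page = __grab_cache_page(mapping, index);
			if (!page)
				return -ENOMEM;
			*pagep = page;

			/*
			 * Zero a not-yet-uptodate page so that neither a short
			 * copy nor a partial write can expose stale data (a real
			 * filesystem would only zero the parts it won't write).
			 */
			if (!PageUptodate(page))
				zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
			return 0;
		}

		static int example_write_end(struct file *file,
				struct address_space *mapping, loff_t pos,
				unsigned len, unsigned copied,
				struct page *page, void *fsdata)
		{
			struct inode *inode = mapping->host;

			SetPageUptodate(page);
			set_page_dirty(page);
			/* i_mutex is held, so i_size cannot change under us */
			if (pos + copied > inode->i_size)
				i_size_write(inode, pos + copied);

			unlock_page(page);
			page_cache_release(page);
			return copied;
		}
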

  bmap: called by the VFS to map a logical block offset within object to
  	physical block number. This method is used by the FIBMAP
  	ioctl and for working with swap-files.  To be able to swap to
drivers/block/loop.c: +29 −46
@@ -204,14 +204,13 @@ lo_do_transfer(struct loop_device *lo, int cmd,
 * do_lo_send_aops - helper for writing data to a loop device
 *
 * This is the fast version for backing filesystems which implement the address
 * space operations prepare_write and commit_write.
 * space operations write_begin and write_end.
 */
static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
		int bsize, loff_t pos, struct page *page)
		int bsize, loff_t pos, struct page *unused)
{
	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	pgoff_t index;
	unsigned offset, bv_offs;
	int len, ret;
@@ -223,63 +222,47 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
	len = bvec->bv_len;
	while (len > 0) {
		sector_t IV;
		unsigned size;
		unsigned size, copied;
		int transfer_result;
		struct page *page;
		void *fsdata;

		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
		size = PAGE_CACHE_SIZE - offset;
		if (size > len)
			size = len;
		page = grab_cache_page(mapping, index);
		if (unlikely(!page))

		ret = pagecache_write_begin(file, mapping, pos, size, 0,
							&page, &fsdata);
		if (ret)
			goto fail;
		ret = aops->prepare_write(file, page, offset,
					  offset + size);
		if (unlikely(ret)) {
			if (ret == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				continue;
			}
			goto unlock;
		}

		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
				bvec->bv_page, bv_offs, size, IV);
		if (unlikely(transfer_result)) {
			/*
			 * The transfer failed, but we still write the data to
			 * keep prepare/commit calls balanced.
			 */
			printk(KERN_ERR "loop: transfer error block %llu\n",
			       (unsigned long long)index);
			zero_user_page(page, offset, size, KM_USER0);
		}
		flush_dcache_page(page);
		ret = aops->commit_write(file, page, offset,
					 offset + size);
		if (unlikely(ret)) {
			if (ret == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				continue;
			}
			goto unlock;
		}
		copied = size;
		if (unlikely(transfer_result))
			copied = 0;

		ret = pagecache_write_end(file, mapping, pos, size, copied,
							page, fsdata);
		if (ret < 0)
			goto fail;
		if (ret < copied)
			copied = ret;

		if (unlikely(transfer_result))
			goto unlock;
		bv_offs += size;
		len -= size;
			goto fail;

		bv_offs += copied;
		len -= copied;
		offset = 0;
		index++;
		pos += size;
		unlock_page(page);
		page_cache_release(page);
		pos += copied;
	}
	ret = 0;
out:
	mutex_unlock(&mapping->host->i_mutex);
	return ret;
unlock:
	unlock_page(page);
	page_cache_release(page);
fail:
	ret = -1;
	goto out;
@@ -313,7 +296,7 @@ static int __do_lo_send_write(struct file *file,
 * do_lo_send_direct_write - helper for writing data to a loop device
 *
 * This is the fast, non-transforming version for backing filesystems which do
 * not implement the address space operations prepare_write and commit_write.
 * not implement the address space operations write_begin and write_end.
 * It uses the write file operation which should be present on all writeable
 * filesystems.
 */
@@ -332,7 +315,7 @@ static int do_lo_send_direct_write(struct loop_device *lo,
 * do_lo_send_write - helper for writing data to a loop device
 *
 * This is the slow, transforming version for filesystems which do not
 * implement the address space operations prepare_write and commit_write.  It
 * implement the address space operations write_begin and write_end.  It
 * uses the write file operation which should be present on all writeable
 * filesystems.
 *
@@ -780,7 +763,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
		 */
		if (!file->f_op->splice_read)
			goto out_putf;
		if (aops->prepare_write && aops->commit_write)
		if (aops->prepare_write || aops->write_begin)
			lo_flags |= LO_FLAGS_USE_AOPS;
		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
			lo_flags |= LO_FLAGS_READ_ONLY;
fs/buffer.c: +169 −32
@@ -1770,6 +1770,48 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
	goto done;
}

/*
 * If a page has any new buffers, zero them out here, and mark them uptodate
 * and dirty so they'll be written out (in order to prevent uninitialised
 * block data from leaking). And clear the new bit.
 */
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
{
	unsigned int block_start, block_end;
	struct buffer_head *head, *bh;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		return;

	bh = head = page_buffers(page);
	block_start = 0;
	do {
		block_end = block_start + bh->b_size;

		if (buffer_new(bh)) {
			if (block_end > from && block_start < to) {
				if (!PageUptodate(page)) {
					unsigned start, size;

					start = max(from, block_start);
					size = min(to, block_end) - start;

					zero_user_page(page, start, size, KM_USER0);
					set_buffer_uptodate(bh);
				}

				clear_buffer_new(bh);
				mark_buffer_dirty(bh);
			}
		}

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);
}
EXPORT_SYMBOL(page_zero_new_buffers);

static int __block_prepare_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to, get_block_t *get_block)
{
@@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (!err) {
		bh = head;
		do {
			if (buffer_new(bh))
				clear_buffer_new(bh);
		} while ((bh = bh->b_this_page) != head);
		return 0;
	}
	/* Error case: */
	/*
	 * Zero out any newly allocated blocks to avoid exposing stale
	 * data.  If BH_New is set, we know that the block was newly
	 * allocated in the above loop.
	 */
	bh = head;
	block_start = 0;
	do {
		block_end = block_start+blocksize;
		if (block_end <= from)
			goto next_bh;
		if (block_start >= to)
			break;
		if (buffer_new(bh)) {
			clear_buffer_new(bh);
			zero_user_page(page, block_start, bh->b_size, KM_USER0);
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
next_bh:
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	return err;
}

@@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
		clear_buffer_new(bh);
	}

	/*
@@ -1923,6 +1936,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
	return 0;
}

/*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
 *
 * If *pagep is not NULL, then block_write_begin uses the locked page
 * at *pagep rather than allocating its own. In this case, the page will
 * not be unlocked or deallocated on failure.
 */
int block_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata,
			get_block_t *get_block)
{
	struct inode *inode = mapping->host;
	int status = 0;
	struct page *page;
	pgoff_t index;
	unsigned start, end;
	int ownpage = 0;

	index = pos >> PAGE_CACHE_SHIFT;
	start = pos & (PAGE_CACHE_SIZE - 1);
	end = start + len;

	page = *pagep;
	if (page == NULL) {
		ownpage = 1;
		page = __grab_cache_page(mapping, index);
		if (!page) {
			status = -ENOMEM;
			goto out;
		}
		*pagep = page;
	} else
		BUG_ON(!PageLocked(page));

	status = __block_prepare_write(inode, page, start, end, get_block);
	if (unlikely(status)) {
		ClearPageUptodate(page);

		if (ownpage) {
			unlock_page(page);
			page_cache_release(page);
			*pagep = NULL;

			/*
			 * prepare_write() may have instantiated a few blocks
			 * outside i_size.  Trim these off again. Don't need
			 * i_size_read because we hold i_mutex.
			 */
			if (pos + len > inode->i_size)
				vmtruncate(inode, inode->i_size);
		}
		goto out;
	}

out:
	return status;
}
EXPORT_SYMBOL(block_write_begin);
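
/*
 * Illustrative sketch, not part of this patch: a block-based filesystem
 * is expected to implement its ->write_begin simply by handing its own
 * get_block callback to the helper above ("myfs_write_begin" and
 * "myfs_get_block" are hypothetical names).
 */
static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	*pagep = NULL;	/* let block_write_begin find and lock the page */
	return block_write_begin(file, mapping, pos, len, flags,
					pagep, fsdata, myfs_get_block);
}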

int block_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	unsigned start;

	start = pos & (PAGE_CACHE_SIZE - 1);

	if (unlikely(copied < len)) {
		/*
		 * The buffers that were written will now be uptodate, so we
		 * don't have to worry about a readpage reading them and
		 * overwriting a partial write. However if we have encountered
		 * a short write and only partially written into a buffer, it
		 * will not be marked uptodate, so a readpage might come in and
		 * destroy our partial write.
		 *
		 * Do the simplest thing, and just treat any short write to a
		 * non uptodate page as a zero-length write, and force the
		 * caller to redo the whole thing.
		 */
		if (!PageUptodate(page))
			copied = 0;

		page_zero_new_buffers(page, start+copied, start+len);
	}
	flush_dcache_page(page);

	/* This could be a short (even 0-length) commit */
	__block_commit_write(inode, page, start, start+copied);

	return copied;
}
EXPORT_SYMBOL(block_write_end);

int generic_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 *
	 * But it's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 */
	if (pos+copied > inode->i_size) {
		i_size_write(inode, pos+copied);
		mark_inode_dirty(inode);
	}

	unlock_page(page);
	page_cache_release(page);

	return copied;
}
EXPORT_SYMBOL(generic_write_end);
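
/*
 * Illustrative note, not part of this patch: generic_write_end has the same
 * signature as the ->write_end address_space operation, so a filesystem
 * whose ->write_begin delegates to block_write_begin can usually point its
 * operation table straight at it (names hypothetical):
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.readpage	= myfs_readpage,
 *		.write_begin	= myfs_write_begin,
 *		.write_end	= generic_write_end,
 *	};
 */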

/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
fs/libfs.c: +44 −0
@@ -351,6 +351,26 @@ int simple_prepare_write(struct file *file, struct page *page,
	return 0;
}

int simple_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	struct page *page;
	pgoff_t index;
	unsigned from;

	index = pos >> PAGE_CACHE_SHIFT;
	from = pos & (PAGE_CACHE_SIZE - 1);

	page = __grab_cache_page(mapping, index);
	if (!page)
		return -ENOMEM;

	*pagep = page;

	return simple_prepare_write(file, page, from, from+len);
}

int simple_commit_write(struct file *file, struct page *page,
			unsigned from, unsigned to)
{
@@ -369,6 +389,28 @@ int simple_commit_write(struct file *file, struct page *page,
	return 0;
}

int simple_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	unsigned from = pos & (PAGE_CACHE_SIZE - 1);

	/* zero the stale part of the page if we did a short copy */
	if (copied < len) {
		void *kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr + from + copied, 0, len - copied);
		flush_dcache_page(page);
		kunmap_atomic(kaddr, KM_USER0);
	}

	simple_commit_write(file, page, from, from+copied);

	unlock_page(page);
	page_cache_release(page);

	return copied;
}

/*
 * the inodes created here are not hashed. If you use iunique to generate
 * unique inode values later for this filesystem, then you must take care
@@ -642,6 +684,8 @@ EXPORT_SYMBOL(dcache_dir_open);
EXPORT_SYMBOL(dcache_readdir);
EXPORT_SYMBOL(generic_read_dir);
EXPORT_SYMBOL(get_sb_pseudo);
EXPORT_SYMBOL(simple_write_begin);
EXPORT_SYMBOL(simple_write_end);
EXPORT_SYMBOL(simple_commit_write);
EXPORT_SYMBOL(simple_dir_inode_operations);
EXPORT_SYMBOL(simple_dir_operations);