Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 60b11392 authored by Mark Fasheh's avatar Mark Fasheh
Browse files

ocfs2: zero tail of sparse files on truncate



Since we don't zero on extend anymore, truncate needs to be fixed up to zero
the part of a file between i_size and and end of it's cluster. Otherwise a
subsequent extend could expose bad data.

This introduced a new helper, which can be used in ocfs2_write().

Signed-off-by: default avatarMark Fasheh <mark.fasheh@oracle.com>
parent 25baf2da
Loading
Loading
Loading
Loading
+224 −0
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/swap.h>

#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
@@ -34,6 +35,7 @@
#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
@@ -3342,6 +3344,228 @@ bail:
	return status;
}

static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return 0;
}

static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return ocfs2_journal_dirty_data(handle, bh);
}

static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
				     struct page **pages, int numpages,
				     u64 phys, handle_t *handle)
{
	int i, ret, partial = 0;
	void *kaddr;
	struct page *page;
	unsigned int from, to = PAGE_CACHE_SIZE;
	struct super_block *sb = inode->i_sb;

	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));

	if (numpages == 0)
		goto out;

	from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
	if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
		/*
		 * Since 'from' has been capped to a value below page
		 * size, this calculation won't be able to overflow
		 * 'to'
		 */
		to = ocfs2_align_bytes_to_clusters(sb, from);

		/*
		 * The truncate tail in this case should never contain
		 * more than one page at maximum. The loop below also
		 * assumes this.
		 */
		BUG_ON(numpages != 1);
	}

	for(i = 0; i < numpages; i++) {
		page = pages[i];

		BUG_ON(from > PAGE_CACHE_SIZE);
		BUG_ON(to > PAGE_CACHE_SIZE);

		ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
		if (ret)
			mlog_errno(ret);

		kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr + from, 0, to - from);
		kunmap_atomic(kaddr, KM_USER0);

		/*
		 * Need to set the buffers we zero'd into uptodate
		 * here if they aren't - ocfs2_map_page_blocks()
		 * might've skipped some
		 */
		if (ocfs2_should_order_data(inode)) {
			ret = walk_page_buffers(handle,
						page_buffers(page),
						from, to, &partial,
						ocfs2_ordered_zero_func);
			if (ret < 0)
				mlog_errno(ret);
		} else {
			ret = walk_page_buffers(handle, page_buffers(page),
						from, to, &partial,
						ocfs2_writeback_zero_func);
			if (ret < 0)
				mlog_errno(ret);
		}

		if (!partial)
			SetPageUptodate(page);

		flush_dcache_page(page);

		/*
		 * Every page after the 1st one should be completely zero'd.
		 */
		from = 0;
	}
out:
	if (pages) {
		for (i = 0; i < numpages; i++) {
			page = pages[i];
			unlock_page(page);
			mark_page_accessed(page);
			page_cache_release(page);
		}
	}
}

static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
				int *num, u64 *phys)
{
	int i, numpages = 0, ret = 0;
	unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
	struct super_block *sb = inode->i_sb;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index;
	u64 next_cluster_bytes;

	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));

	/* Cluster boundary, so we don't need to grab any pages. */
	if ((isize & (csize - 1)) == 0)
		goto out;

	ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
					  phys, NULL);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Tail is a hole. */
	if (*phys == 0)
		goto out;

	next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
	index = isize >> PAGE_CACHE_SHIFT;
	do {
		pages[numpages] = grab_cache_page(mapping, index);
		if (!pages[numpages]) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		numpages++;
		index++;
	} while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));

out:
	if (ret != 0) {
		if (pages) {
			for (i = 0; i < numpages; i++) {
				if (pages[i]) {
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
				}
			}
		}
		numpages = 0;
	}

	*num = numpages;

	return ret;
}

/*
 * Zero the area past i_size but still within an allocated
 * cluster. This avoids exposing nonzero data on subsequent file
 * extends.
 *
 * We need to call this before i_size is updated on the inode because
 * otherwise block_write_full_page() will skip writeout of pages past
 * i_size. The new_i_size parameter is passed for this reason.
 */
int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
				 u64 new_i_size)
{
	int ret, numpages;
	struct page **pages = NULL;
	u64 phys;

	/*
	 * File systems which don't support sparse files zero on every
	 * extend.
	 */
	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		return 0;

	pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
			sizeof(struct page *), GFP_NOFS);
	if (pages == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Truncate on an i_size boundary - nothing more to do.
	 */
	if (numpages == 0)
		goto out;

	ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
				 handle);

	/*
	 * Initiate writeout of the pages we zero'd here. We don't
	 * wait on them - the truncate_inode_pages() call later will
	 * do that for us.
	 */
	ret = filemap_fdatawrite(inode->i_mapping);
	if (ret)
		mlog_errno(ret);

out:
	if (pages)
		kfree(pages);

	return ret;
}

/*
 * It is expected, that by the time you call this function,
 * inode->i_size and fe->i_size have been adjusted.
+2 −0
Original line number Diff line number Diff line
@@ -71,6 +71,8 @@ struct ocfs2_truncate_context {
	struct buffer_head *tc_last_eb_bh;
};

int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
				 u64 new_i_size);
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct buffer_head *fe_bh,
+15 −19
Original line number Diff line number Diff line
@@ -308,7 +308,7 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
 * their fixes when they happen) --Mark */
static int walk_page_buffers(	handle_t *handle,
int walk_page_buffers(	handle_t *handle,
			struct buffer_head *head,
			unsigned from,
			unsigned to,
@@ -654,7 +654,7 @@ static void ocfs2_clear_page_regions(struct page *page,
 *
 * This will also skip zeroing, which is handled externally.
 */
static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
			  struct inode *inode, unsigned int from,
			  unsigned int to, int new)
{
@@ -675,8 +675,7 @@ static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
		 * Ignore blocks outside of our i/o range -
		 * they may belong to unallocated clusters.
		 */
		if (block_start >= to ||
		    (block_start + bsize) <= from) {
		if (block_start >= to || block_end <= from) {
			if (PageUptodate(page))
				set_buffer_uptodate(bh);
			continue;
@@ -971,7 +970,6 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
	u64 v_blkno, p_blkno;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	unsigned long index, start;
	struct page **cpages;

@@ -979,13 +977,11 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,

	/*
	 * Figure out how many pages we'll be manipulating here. For
	 * non-allocating write, or any writes where cluster size is
	 * less than page size, we only need one page. Otherwise,
	 * allocating writes of cluster size larger than page size
	 * need cluster size pages.
	 * non allocating write, we just change the one
	 * page. Otherwise, we'll need a whole clusters worth.
	 */
	if (new && !wc->w_large_pages)
		numpages = (1 << cbits) / PAGE_SIZE;
	if (new)
		numpages = ocfs2_pages_per_cluster(inode->i_sb);

	cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
	if (!cpages) {
+12 −0
Original line number Diff line number Diff line
@@ -30,6 +30,18 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
							 unsigned from,
							 unsigned to);

int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
			  struct inode *inode, unsigned int from,
			  unsigned int to, int new);

int walk_page_buffers(	handle_t *handle,
			struct buffer_head *head,
			unsigned from,
			unsigned to,
			int *partial,
			int (*fn)(	handle_t *handle,
					struct buffer_head *bh));

struct ocfs2_write_ctxt;
typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
				u64 *, unsigned int *, unsigned int *);
+35 −5
Original line number Diff line number Diff line
@@ -262,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
{
	int status;
	handle_t *handle;
	struct ocfs2_dinode *di;

	mlog_entry_void();

@@ -275,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
		goto out;
	}

	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out_commit;
	}

	/*
	 * Do this before setting i_size.
	 */
	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	i_size_write(inode, new_i_size);
	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	di = (struct ocfs2_dinode *) fe_bh->b_data;
	di->i_size = cpu_to_le64(new_i_size);
	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0)
		mlog_errno(status);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:

	mlog_exit(status);
	return status;
}
@@ -343,7 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
		mlog_errno(status);
		goto bail;
	}
	ocfs2_data_unlock(inode, 1);

	/* alright, we're going to need to do a full blown alloc size
	 * change. Orphan the inode so that recovery can complete the
@@ -352,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
		goto bail_unlock_data;
	}

	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
		goto bail_unlock_data;
	}

	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
		goto bail_unlock_data;
	}

	/* TODO: orphan dir cleanup here. */
bail_unlock_data:
	ocfs2_data_unlock(inode, 1);

bail:

	mlog_exit(status);
Loading