Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5693486b authored by Joel Becker
Browse files

ocfs2: Zero the tail cluster when extending past i_size.



ocfs2's allocation unit is the cluster.  This can be larger than a block
or even a memory page.  This means that a file may have many blocks in
its last extent that are beyond the block containing i_size.  There also
may be more unwritten extents after that.

When ocfs2 grows a file, it zeros the entire cluster in order to ensure
future i_size growth will see cleared blocks.  Unfortunately,
block_write_full_page() drops the pages past i_size.  This means that
ocfs2 is actually leaking garbage data into the tail end of that last
cluster.  This is a bug.

We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
when a write or truncate is past i_size.  They will use
ocfs2_zero_extend() to ensure the data is properly zeroed.

Older versions of ocfs2_zero_extend() simply zeroed every block between
i_size and the zeroing position.  This presumes three things:

1) There is allocation for all of these blocks.
2) The extents are not unwritten.
3) The extents are not refcounted.

(1) and (2) hold true for non-sparse filesystems, which used to be the
only users of ocfs2_zero_extend().  (3) is another bug.

Since we're now using ocfs2_zero_extend() for sparse filesystems as
well, we teach ocfs2_zero_extend() to check every extent between
i_size and the zeroing position.  If the extent is unwritten, it is
ignored.  If it is refcounted, it is CoWed.  Then it is zeroed.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
parent a4bfb4cf
Loading
Loading
Loading
Loading
+28 −14
Original line number Original line Diff line number Diff line
@@ -196,14 +196,13 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
			dump_stack();
			dump_stack();
			goto bail;
			goto bail;
		}
		}
	}


	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
	     (unsigned long long)past_eof);
	     (unsigned long long)past_eof);

	if (create && (iblock >= past_eof))
	if (create && (iblock >= past_eof))
		set_buffer_new(bh_result);
		set_buffer_new(bh_result);
	}


bail:
bail:
	if (err < 0)
	if (err < 0)
@@ -1590,21 +1589,20 @@ out:
 * write path can treat it as a non-allocating write, which has no
 * write path can treat it as a non-allocating write, which has no
 * special case code for sparse/nonsparse files.
 * special case code for sparse/nonsparse files.
 */
 */
static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
static int ocfs2_expand_nonsparse_inode(struct inode *inode,
					unsigned len,
					struct buffer_head *di_bh,
					loff_t pos, unsigned len,
					struct ocfs2_write_ctxt *wc)
					struct ocfs2_write_ctxt *wc)
{
{
	int ret;
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	loff_t newsize = pos + len;
	loff_t newsize = pos + len;


	if (ocfs2_sparse_alloc(osb))
	BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
		return 0;


	if (newsize <= i_size_read(inode))
	if (newsize <= i_size_read(inode))
		return 0;
		return 0;


	ret = ocfs2_extend_no_holes(inode, newsize, pos);
	ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
	if (ret)
	if (ret)
		mlog_errno(ret);
		mlog_errno(ret);


@@ -1614,6 +1612,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
	return ret;
	return ret;
}
}


/*
 * Zero the region between the current i_size and @pos before a write
 * that starts past i_size.
 *
 * Only valid on filesystems with sparse allocation; calling it on any
 * other filesystem trips the BUG_ON() below.  When @pos is at or before
 * i_size there is nothing to zero and 0 is returned.  Otherwise the
 * result of ocfs2_zero_extend() is returned unchanged.
 *
 * @di_bh is handed straight through to ocfs2_zero_extend().
 */
static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
			   loff_t pos)
{
	int ret = 0;

	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
	/* Only a position strictly past i_size leaves a gap to zero. */
	if (pos > i_size_read(inode))
		ret = ocfs2_zero_extend(inode, di_bh, pos);

	return ret;
}

int ocfs2_write_begin_nolock(struct address_space *mapping,
int ocfs2_write_begin_nolock(struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned flags,
			     loff_t pos, unsigned len, unsigned flags,
			     struct page **pagep, void **fsdata,
			     struct page **pagep, void **fsdata,
@@ -1649,7 +1659,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
		}
		}
	}
	}


	ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
	if (ocfs2_sparse_alloc(osb))
		ret = ocfs2_zero_tail(inode, di_bh, pos);
	else
		ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
						   wc);
	if (ret) {
	if (ret) {
		mlog_errno(ret);
		mlog_errno(ret);
		goto out;
		goto out;
+166 −35
Original line number Original line Diff line number Diff line
@@ -787,6 +787,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
	if (!zero_to)
	if (!zero_to)
		zero_to = PAGE_CACHE_SIZE;
		zero_to = PAGE_CACHE_SIZE;


	mlog(0,
	     "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
	     (unsigned long long)abs_from, (unsigned long long)abs_to,
	     index, zero_from, zero_to);

	/* We know that zero_from is block aligned */
	/* We know that zero_from is block aligned */
	for (block_start = zero_from; block_start < zero_to;
	for (block_start = zero_from; block_start < zero_to;
	     block_start = block_end) {
	     block_start = block_end) {
@@ -833,25 +838,114 @@ out:
	return ret;
	return ret;
}
}


static int ocfs2_zero_extend(struct inode *inode,
/*
			     u64 zero_to_size)
 * Find the next range to zero.  We do this in terms of bytes because
 * that's what ocfs2_zero_extend() wants, and it is dealing with the
 * pagecache.  We may return multiple extents.
 *
 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
 * needs to be zeroed.  range_start and range_end return the next zeroing
 * range.  A subsequent call should pass the previous range_end as its
 * zero_start.  If range_end is 0, there's nothing to do.
 *
 * Unwritten extents are skipped over.  Refcounted extents are CoWed.
 */
static int ocfs2_zero_extend_get_range(struct inode *inode,
				       struct buffer_head *di_bh,
				       u64 zero_start, u64 zero_end,
				       u64 *range_start, u64 *range_end)
{
{
	int ret = 0;
	int rc = 0, needs_cow = 0;
	u64 start_off, next_off;
	u32 p_cpos, zero_clusters = 0;
	struct super_block *sb = inode->i_sb;
	u32 zero_cpos =
		zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;


	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
	while (zero_cpos < last_cpos) {
	while (start_off < zero_to_size) {
		rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
		next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
					&num_clusters, &ext_flags);
		if (next_off > zero_to_size)
		if (rc) {
			next_off = zero_to_size;
			mlog_errno(rc);
		ret = ocfs2_write_zero_page(inode, start_off, next_off);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
			goto out;
		}
		}


		start_off = next_off;
		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
			zero_clusters = num_clusters;
			if (ext_flags & OCFS2_EXT_REFCOUNTED)
				needs_cow = 1;
			break;
		}

		zero_cpos += num_clusters;
	}
	if (!zero_clusters) {
		*range_end = 0;
		goto out;
	}

	while ((zero_cpos + zero_clusters) < last_cpos) {
		rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
					&p_cpos, &num_clusters,
					&ext_flags);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}

		if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
			break;
		if (ext_flags & OCFS2_EXT_REFCOUNTED)
			needs_cow = 1;
		zero_clusters += num_clusters;
	}
	if ((zero_cpos + zero_clusters) > last_cpos)
		zero_clusters = last_cpos - zero_cpos;

	if (needs_cow) {
		rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
					UINT_MAX);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}
	}

	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
					     zero_cpos + zero_clusters);

out:
	return rc;
}

/*
 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
 * has made sure that the entire range needs zeroing.
 */
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
				   u64 range_end)
{
	int rc = 0;
	u64 next_pos;
	u64 zero_pos = range_start;

	mlog(0, "range_start = %llu, range_end = %llu\n",
	     (unsigned long long)range_start,
	     (unsigned long long)range_end);
	BUG_ON(range_start >= range_end);

	while (zero_pos < range_end) {
		next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
		if (next_pos > range_end)
			next_pos = range_end;
		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
		if (rc < 0) {
			mlog_errno(rc);
			break;
		}
		zero_pos = next_pos;


		/*
		/*
		 * Very large extends have the potential to lock up
		 * Very large extends have the potential to lock up
@@ -860,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
		cond_resched();
		cond_resched();
	}
	}


out:
	return rc;
}

/*
 * Zero the file from the block-aligned form of the current i_size up to
 * @zero_to_size, one range at a time.
 *
 * Each pass asks ocfs2_zero_extend_get_range() for the next range to
 * zero, clamps that range to [zero_start, zero_to_size], zeroes it with
 * ocfs2_zero_extend_range(), and then resumes from the end of the range
 * just handled.  The loop stops when zero_start reaches @zero_to_size,
 * when the helper reports no further range (range_end == 0), or on the
 * first error.
 *
 * @di_bh is passed through to ocfs2_zero_extend_get_range().
 *
 * Returns 0 on success, or the first non-zero value returned by either
 * helper.
 */
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
		      loff_t zero_to_size)
{
	int ret = 0;
	u64 zero_start, range_start = 0, range_end = 0;
	struct super_block *sb = inode->i_sb;

	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
	mlog(0, "zero_start %llu for i_size %llu\n",
	     (unsigned long long)zero_start,
	     (unsigned long long)i_size_read(inode));
	while (zero_start < zero_to_size) {
		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
						  zero_to_size,
						  &range_start,
						  &range_end);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		/* A zero range_end means there is nothing left to zero. */
		if (!range_end)
			break;
		/*
		 * Trim the ends: the helper reports ranges on cluster
		 * boundaries, which may extend outside the span we were
		 * asked to zero.
		 */
		if (range_start < zero_start)
			range_start = zero_start;
		if (range_end > zero_to_size)
			range_end = zero_to_size;

		ret = ocfs2_zero_extend_range(inode, range_start,
					      range_end);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		/* Continue the search from the end of the range just zeroed. */
		zero_start = range_end;
	}

	return ret;
	return ret;
}
}


int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
			  u64 new_i_size, u64 zero_to)
{
{
	int ret;
	int ret;
	u32 clusters_to_add;
	u32 clusters_to_add;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);


	/*
	 * Only quota files call this without a bh, and they can't be
	 * refcounted.
	 */
	BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));

	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
	if (clusters_to_add < oi->ip_clusters)
	if (clusters_to_add < oi->ip_clusters)
		clusters_to_add = 0;
		clusters_to_add = 0;
@@ -890,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
	 * still need to zero the area between the old i_size and the
	 * still need to zero the area between the old i_size and the
	 * new i_size.
	 * new i_size.
	 */
	 */
	ret = ocfs2_zero_extend(inode, zero_to);
	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
	if (ret < 0)
	if (ret < 0)
		mlog_errno(ret);
		mlog_errno(ret);


@@ -915,24 +1056,12 @@ static int ocfs2_extend_file(struct inode *inode,
		goto out;
		goto out;
	BUG_ON(new_i_size < i_size_read(inode));
	BUG_ON(new_i_size < i_size_read(inode));


	/*
	 * Fall through for converting inline data, even if the fs
	 * supports sparse files.
	 *
	 * The check for inline data here is legal - nobody can add
	 * the feature since we have i_mutex. We must check it again
	 * after acquiring ip_alloc_sem though, as paths like mmap
	 * might have raced us to converting the inode to extents.
	 */
	if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
	    && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		goto out_update_size;

	/*
	/*
	 * The alloc sem blocks people in read/write from reading our
	 * The alloc sem blocks people in read/write from reading our
	 * allocation until we're done changing it. We depend on
	 * allocation until we're done changing it. We depend on
	 * i_mutex to block other extend/truncate calls while we're
	 * i_mutex to block other extend/truncate calls while we're
	 * here.
	 * here.  We even have to hold it for sparse files because there
	 * might be some tail zeroing.
	 */
	 */
	down_write(&oi->ip_alloc_sem);
	down_write(&oi->ip_alloc_sem);


@@ -949,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
		if (ret) {
			up_write(&oi->ip_alloc_sem);
			up_write(&oi->ip_alloc_sem);

			mlog_errno(ret);
			mlog_errno(ret);
			goto out;
			goto out;
		}
		}
	}
	}


	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
		ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
	else
		ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
					    new_i_size);


	up_write(&oi->ip_alloc_sem);
	up_write(&oi->ip_alloc_sem);


+4 −2
Original line number Original line Diff line number Diff line
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
int ocfs2_simple_size_update(struct inode *inode,
int ocfs2_simple_size_update(struct inode *inode,
			     struct buffer_head *di_bh,
			     struct buffer_head *di_bh,
			     u64 new_i_size);
			     u64 new_i_size);
int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
			  u64 zero_to);
			  u64 new_i_size, u64 zero_to);
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
		      loff_t zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
		  struct kstat *stat);
		  struct kstat *stat);
+1 −1
Original line number Original line Diff line number Diff line
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
		 * locking allocators ranks above a transaction start
		 * locking allocators ranks above a transaction start
		 */
		 */
		WARN_ON(journal_current_handle());
		WARN_ON(journal_current_handle());
		status = ocfs2_extend_no_holes(gqinode,
		status = ocfs2_extend_no_holes(gqinode, NULL,
			gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
			gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
			gqinode->i_size);
			gqinode->i_size);
		if (status < 0)
		if (status < 0)
+2 −2
Original line number Original line Diff line number Diff line
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
	u64 p_blkno;
	u64 p_blkno;


	/* We are protected by dqio_sem so no locking needed */
	/* We are protected by dqio_sem so no locking needed */
	status = ocfs2_extend_no_holes(lqinode,
	status = ocfs2_extend_no_holes(lqinode, NULL,
				       lqinode->i_size + 2 * sb->s_blocksize,
				       lqinode->i_size + 2 * sb->s_blocksize,
				       lqinode->i_size);
				       lqinode->i_size);
	if (status < 0) {
	if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
		return ocfs2_local_quota_add_chunk(sb, type, offset);
		return ocfs2_local_quota_add_chunk(sb, type, offset);


	/* We are protected by dqio_sem so no locking needed */
	/* We are protected by dqio_sem so no locking needed */
	status = ocfs2_extend_no_holes(lqinode,
	status = ocfs2_extend_no_holes(lqinode, NULL,
				       lqinode->i_size + sb->s_blocksize,
				       lqinode->i_size + sb->s_blocksize,
				       lqinode->i_size);
				       lqinode->i_size);
	if (status < 0) {
	if (status < 0) {
Loading