Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 53069d4e authored by Tristan Ye's avatar Tristan Ye
Browse files

Ocfs2/move_extents: move/defrag extents within a certain range.



the basic logic of moving extents for a file is pretty like punching-hole
sequence, walk the extents within the range as user specified, calculating
an appropriate len to defrag/move, then let ocfs2_defrag/move_extent() to
do the actual moving.

This func ends up setting 'OCFS2_MOVE_EXT_FL_COMPLETE' to userpace if operation
gets done successfully.

Signed-off-by: default avatarTristan Ye <tristan.ye@oracle.com>
parent ee16cc03
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@
#include "dir.h"
#include "buffer_head_io.h"
#include "suballoc.h"
#include "move_extents.h"

#include <linux/ext2_fs.h>

@@ -951,6 +952,8 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
			return -EFAULT;

		return ocfs2_info_handle(inode, &info, 0);
	case OCFS2_IOC_MOVE_EXT:
		return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
	default:
		return -ENOTTY;
	}
@@ -993,6 +996,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
			return -EFAULT;

		return ocfs2_info_handle(inode, &info, 1);
	case OCFS2_IOC_MOVE_EXT:
		break;
	default:
		return -ENOIOCTLCMD;
	}
+308 −0
Original line number Diff line number Diff line
@@ -827,3 +827,311 @@ static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
		*len_defraged = 0;
	}
}

static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
				struct ocfs2_move_extents_context *context)
{
	int ret = 0, flags, do_defrag, skip = 0;
	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;

	struct inode *inode = context->inode;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_move_extents *range = context->range;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if ((inode->i_size == 0) || (range->me_len == 0))
		return 0;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		return 0;

	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);

	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&context->dealloc);

	/*
	 * TO-DO XXX:
	 *
	 * - xattr extents.
	 */

	do_defrag = context->auto_defrag;

	/*
	 * extents moving happens in unit of clusters, for the sake
	 * of simplicity, we may ignore two clusters where 'byte_start'
	 * and 'byte_start + len' were within.
	 */
	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
	len_to_move = (range->me_start + range->me_len) >>
						osb->s_clustersize_bits;
	if (len_to_move >= move_start)
		len_to_move -= move_start;
	else
		len_to_move = 0;

	if (do_defrag)
		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
	else
		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
							 range->me_goal);

	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
	     "thresh: %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)range->me_start,
	     (unsigned long long)range->me_len,
	     move_start, len_to_move, defrag_thresh);

	cpos = move_start;
	while (len_to_move) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
					 &flags);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (alloc_size > len_to_move)
			alloc_size = len_to_move;

		/*
		 * XXX: how to deal with a hole:
		 *
		 * - skip the hole of course
		 * - force a new defragmentation
		 */
		if (!phys_cpos) {
			if (do_defrag)
				len_defraged = 0;

			goto next;
		}

		if (do_defrag) {
			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
						     defrag_thresh, &skip);
			/*
			 * skip large extents
			 */
			if (skip) {
				skip = 0;
				goto next;
			}

			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
			     "alloc_size: %u, len_defraged: %u\n",
			     cpos, phys_cpos, alloc_size, len_defraged);

			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
						  alloc_size, flags);
		} else {
			ret = ocfs2_move_extent(context, cpos, phys_cpos,
						&new_phys_cpos, alloc_size,
						flags);

			new_phys_cpos += alloc_size;
		}

		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		context->clusters_moved += alloc_size;
next:
		cpos += alloc_size;
		len_to_move -= alloc_size;
	}

	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;

out:
	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
						      context->clusters_moved);
	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
						       context->new_phys_cpos);

	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &context->dealloc);

	return ret;
}

static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
{
	int status;
	handle_t *handle;
	struct inode *inode = context->inode;
	struct ocfs2_dinode *di;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (!inode)
		return -ENOENT;

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	mutex_lock(&inode->i_mutex);

	/*
	 * This prevents concurrent writes from other nodes
	 */
	status = ocfs2_rw_lock(inode, 1);
	if (status) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_inode_lock(inode, &di_bh, 1);
	if (status) {
		mlog_errno(status);
		goto out_rw_unlock;
	}

	/*
	 * rememer ip_xattr_sem also needs to be held if necessary
	 */
	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	status = __ocfs2_move_extents_range(di_bh, context);

	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	if (status) {
		mlog_errno(status);
		goto out_inode_unlock;
	}

	/*
	 * We update ctime for these changes
	 */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_inode_unlock;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	di = (struct ocfs2_dinode *)di_bh->b_data;
	inode->i_ctime = CURRENT_TIME;
	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);

out_inode_unlock:
	brelse(di_bh);
	ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
	ocfs2_rw_unlock(inode, 1);
out:
	mutex_unlock(&inode->i_mutex);

	return status;
}

int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
{
	int status;

	struct inode *inode = filp->f_path.dentry->d_inode;
	struct ocfs2_move_extents range;
	struct ocfs2_move_extents_context *context = NULL;

	status = mnt_want_write(filp->f_path.mnt);
	if (status)
		return status;

	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
		goto out;

	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
		status = -EPERM;
		goto out;
	}

	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
	if (!context) {
		status = -ENOMEM;
		mlog_errno(status);
		goto out;
	}

	context->inode = inode;
	context->file = filp;

	if (argp) {
		if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
				   sizeof(range))) {
			status = -EFAULT;
			goto out;
		}
	} else {
		status = -EINVAL;
		goto out;
	}

	if (range.me_start > i_size_read(inode))
		goto out;

	if (range.me_start + range.me_len > i_size_read(inode))
			range.me_len = i_size_read(inode) - range.me_start;

	context->range = &range;

	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
		context->auto_defrag = 1;
		if (!range.me_threshold)
			/*
			 * ok, the default theshold for the defragmentation
			 * is 1M, since our maximum clustersize was 1M also.
			 * any thought?
			 */
			range.me_threshold = 1024 * 1024;
	} else {
		/*
		 * first best-effort attempt to validate and adjust the goal
		 * (physical address in block), while it can't guarantee later
		 * operation can succeed all the time since global_bitmap may
		 * change a bit over time.
		 */

		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
		if (status)
			goto out;
	}

	status = ocfs2_move_extents(context);
	if (status)
		mlog_errno(status);
out:
	/*
	 * movement/defragmentation may end up being partially completed,
	 * that's the reason why we need to return userspace the finished
	 * length and new_offset even if failure happens somewhere.
	 */
	if (argp) {
		if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
				sizeof(range)))
			status = -EFAULT;
	}

	kfree(context);

	mnt_drop_write(filp->f_path.mnt);

	return status;
}
+2 −0
Original line number Diff line number Diff line
@@ -17,4 +17,6 @@
#ifndef OCFS2_MOVE_EXTENTS_H
#define OCFS2_MOVE_EXTENTS_H

int ocfs2_ioctl_move_extents(struct file *filp,  void __user *argp);

#endif /* OCFS2_MOVE_EXTENTS_H */