Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fad61490 authored by Trond Myklebust
Browse files

nfs: Use UNSTABLE + COMMIT for NFS O_DIRECT writes



Currently NFS O_DIRECT writes use FILE_SYNC so that a COMMIT is not
necessary.  This simplifies the internal logic, but this could be a
difficult workload for some servers.

Instead, let's send UNSTABLE writes, and after they all complete, send a
COMMIT for the dirty range.  After the COMMIT returns successfully, then do
the wake_up or fire off aio_complete().

Test plan:
Async direct I/O tests against Solaris (or any server that requires
committed unstable writes).  Reboot server during test.

Based on an earlier patch by Chuck Lever <cel@netapp.com>

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
parent e17b1fc4
Loading
Loading
Loading
Loading
+199 −25
Original line number Original line Diff line number Diff line
@@ -69,11 +69,15 @@ struct nfs_direct_req {
	struct kref		kref;		/* release manager */
	struct kref		kref;		/* release manager */


	/* I/O parameters */
	/* I/O parameters */
	struct list_head	list;		/* nfs_read/write_data structs */
	struct list_head	list,		/* nfs_read/write_data structs */
				rewrite_list;	/* saved nfs_write_data structs */
	struct file *		filp;		/* file descriptor */
	struct file *		filp;		/* file descriptor */
	struct kiocb *		iocb;		/* controlling i/o request */
	struct kiocb *		iocb;		/* controlling i/o request */
	wait_queue_head_t	wait;		/* wait for i/o completion */
	wait_queue_head_t	wait;		/* wait for i/o completion */
	struct inode *		inode;		/* target file of i/o */
	struct inode *		inode;		/* target file of i/o */
	unsigned long		user_addr;	/* location of user's buffer */
	size_t			user_count;	/* total bytes to move */
	loff_t			pos;		/* starting offset in file */
	struct page **		pages;		/* pages in our buffer */
	struct page **		pages;		/* pages in our buffer */
	unsigned int		npages;		/* count of pages */
	unsigned int		npages;		/* count of pages */


@@ -82,8 +86,18 @@ struct nfs_direct_req {
	int			outstanding;	/* i/os we're waiting for */
	int			outstanding;	/* i/os we're waiting for */
	ssize_t			count,		/* bytes actually processed */
	ssize_t			count,		/* bytes actually processed */
				error;		/* any reported error */
				error;		/* any reported error */

	/* commit state */
	struct nfs_write_data *	commit_data;	/* special write_data for commits */
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
};
};


static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);

/**
/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @rw: direction (read or write)
@@ -160,11 +174,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
	kref_init(&dreq->kref);
	kref_init(&dreq->kref);
	init_waitqueue_head(&dreq->wait);
	init_waitqueue_head(&dreq->wait);
	INIT_LIST_HEAD(&dreq->list);
	INIT_LIST_HEAD(&dreq->list);
	INIT_LIST_HEAD(&dreq->rewrite_list);
	dreq->iocb = NULL;
	dreq->iocb = NULL;
	spin_lock_init(&dreq->lock);
	spin_lock_init(&dreq->lock);
	dreq->outstanding = 0;
	dreq->outstanding = 0;
	dreq->count = 0;
	dreq->count = 0;
	dreq->error = 0;
	dreq->error = 0;
	dreq->flags = 0;


	return dreq;
	return dreq;
}
}
@@ -299,7 +315,7 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * an NFS READ operation
 * an NFS READ operation
 */
 */
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
{
{
	struct file *file = dreq->filp;
	struct file *file = dreq->filp;
	struct inode *inode = file->f_mapping->host;
	struct inode *inode = file->f_mapping->host;
@@ -307,11 +323,13 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
							file->private_data;
							file->private_data;
	struct list_head *list = &dreq->list;
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
	size_t rsize = NFS_SERVER(inode)->rsize;
	size_t rsize = NFS_SERVER(inode)->rsize;
	unsigned int curpage, pgbase;
	unsigned int curpage, pgbase;


	curpage = 0;
	curpage = 0;
	pgbase = user_addr & ~PAGE_MASK;
	pgbase = dreq->user_addr & ~PAGE_MASK;
	do {
	do {
		struct nfs_read_data *data;
		struct nfs_read_data *data;
		size_t bytes;
		size_t bytes;
@@ -373,6 +391,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
	if (!dreq)
	if (!dreq)
		return -ENOMEM;
		return -ENOMEM;


	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->npages = nr_pages;
	igrab(inode);
	igrab(inode);
@@ -383,13 +404,137 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size


	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
	rpc_clnt_sigmask(clnt, &oldset);
	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_read_schedule(dreq, user_addr, count, pos);
	nfs_direct_read_schedule(dreq);
	result = nfs_direct_wait(dreq);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);
	rpc_clnt_sigunmask(clnt, &oldset);


	return result;
	return result;
}
}


static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
{
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	while (!list_empty(&dreq->list)) {
		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
		list_del(&data->pages);
		nfs_writedata_release(data);
	}
}

#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
/*
 * Resend all writes of a direct request as stable writes.
 *
 * The saved nfs_write_data structs are moved from dreq->rewrite_list back
 * onto dreq->list, the byte count is restarted from zero, and
 * dreq->outstanding is raised by one for each entry that will be sent.
 */
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct list_head *entry;

	list_splice_init(&dreq->rewrite_list, &dreq->list);

	/* The resent writes are counted afresh. */
	dreq->count = 0;
	list_for_each(entry, &dreq->list)
		dreq->outstanding++;

	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
}

/*
 * rpc_call_done callback for the COMMIT that follows unstable O_DIRECT
 * writes.
 *
 * @task:     the COMMIT RPC task that has just finished
 * @calldata: the commit nfs_write_data; its ->req field carries the owning
 *            nfs_direct_req (see nfs_alloc_commit_data)
 *
 * If the COMMIT failed, or the verifier it returned differs from the one
 * recorded in dreq->verf, the request is flagged so that
 * nfs_direct_write_complete() resends the writes.
 *
 * A failed COMMIT is deliberately NOT stored in dreq->error.  The writes
 * are about to be resent, and nothing on the resend path clears
 * dreq->error, so a value stored here would outlive a successful resend.
 * If the resend itself fails, the write completion path records that error.
 */
static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	/* Call the NFS version-specific code */
	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
		return;

	if (unlikely(task->tk_status < 0)) {
		/* No usable verifier on failure: just ask for a resend. */
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
	nfs_direct_write_complete(dreq, data->inode);
}

/*
 * RPC callbacks attached to the COMMIT task set up by
 * nfs_direct_commit_schedule(): nfs_direct_commit_result handles the reply,
 * and nfs_commit_release is the release hook for the commit data.
 */
static const struct rpc_call_ops nfs_commit_direct_ops = {
	.rpc_call_done = nfs_direct_commit_result,
	.rpc_release = nfs_commit_release,
};

/*
 * Send an asynchronous COMMIT covering the whole direct write request.
 *
 * The preallocated dreq->commit_data is filled in with the file handle and
 * the byte range dreq->pos / dreq->user_count, then run as an
 * RPC_TASK_ASYNC task whose callbacks are nfs_commit_direct_ops.
 *
 * dreq->commit_data is dereferenced without a NULL check, so the caller
 * must only get here while it is still set.
 */
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	struct file *file = dreq->filp;
	struct nfs_open_context *ctx = (struct nfs_open_context *)
							file->private_data;
	struct nfs_write_data *data = dreq->commit_data;
	struct rpc_task *task = &data->task;

	data->inode = dreq->inode;
	data->cred = ctx->cred;

	/* COMMIT arguments: the range originally passed in by the caller */
	data->args.fh = NFS_FH(data->inode);
	data->args.offset = dreq->pos;
	data->args.count = dreq->user_count;
	/* Results land in storage embedded in the same write_data */
	data->res.count = 0;
	data->res.fattr = &data->fattr;
	data->res.verf = &data->verf;

	rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
				&nfs_commit_direct_ops, data);
	/* Protocol-version specific setup of the COMMIT call */
	NFS_PROTO(data->inode)->commit_setup(data, 0);

	data->task.tk_priority = RPC_PRIORITY_NORMAL;
	data->task.tk_cookie = (unsigned long)data->inode;
	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
	dreq->commit_data = NULL;

	dprintk("NFS: %5u initiated commit call\n", task->tk_pid);

	/* rpc_execute() is invoked with the big kernel lock held */
	lock_kernel();
	rpc_execute(&data->task);
	unlock_kernel();
}

/*
 * Decide what happens once the outstanding I/O of a direct write is done.
 *
 * dreq->flags is sampled and cleared, then acted on:
 *   NFS_ODIRECT_DO_COMMIT      - send a COMMIT for the request
 *   NFS_ODIRECT_RESCHED_WRITES - resend the writes
 *   anything else              - finish: end the data update on @inode,
 *                                free an unused commit_data, release the
 *                                write data and complete the request
 */
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	const int pending = dreq->flags;

	dreq->flags = 0;

	if (pending == NFS_ODIRECT_DO_COMMIT) {
		nfs_direct_commit_schedule(dreq);
		return;
	}
	if (pending == NFS_ODIRECT_RESCHED_WRITES) {
		nfs_direct_write_reschedule(dreq);
		return;
	}

	/* Nothing further to send: tear down and report completion. */
	nfs_end_data_update(inode);
	if (dreq->commit_data != NULL)
		nfs_commit_free(dreq->commit_data);
	nfs_direct_free_writedata(dreq);
	nfs_direct_complete(dreq);
}

static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = nfs_commit_alloc(0);
	if (dreq->commit_data != NULL)
		dreq->commit_data->req = (struct nfs_page *) dreq;
}
#else
/*
 * Variant used when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * no commit data is allocated, so dreq->commit_data is always NULL.
 */
static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = NULL;
}

/*
 * Variant used when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * there is no commit or resend step, so finish the request directly by
 * ending the data update on @inode, releasing the write data and
 * completing @dreq.
 */
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	nfs_end_data_update(inode);
	nfs_direct_free_writedata(dreq);
	nfs_direct_complete(dreq);
}
#endif

static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
{
{
	struct list_head *list;
	struct list_head *list;
@@ -424,14 +569,13 @@ static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize
			break;
			break;
		nbytes -= wsize;
		nbytes -= wsize;
	}
	}

	nfs_alloc_commit_data(dreq);

	kref_get(&dreq->kref);
	kref_get(&dreq->kref);
	return dreq;
	return dreq;
}
}


/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
{
{
	struct nfs_write_data *data = calldata;
	struct nfs_write_data *data = calldata;
@@ -440,41 +584,62 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)


	if (nfs_writeback_done(task, data) != 0)
	if (nfs_writeback_done(task, data) != 0)
		return;
		return;
	/* If the server fell back to an UNSTABLE write, it's an error. */
	if (unlikely(data->res.verf->committed != NFS_FILE_SYNC))
		status = -EIO;


	spin_lock(&dreq->lock);
	spin_lock(&dreq->lock);


	if (likely(status >= 0))
	if (likely(status >= 0))
		dreq->count += data->res.count;
		dreq->count += data->res.count;
	else
	else
		dreq->error = status;
		dreq->error = task->tk_status;

	if (data->res.verf->committed != NFS_FILE_SYNC) {
		switch (dreq->flags) {
			case 0:
				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
				break;
			case NFS_ODIRECT_DO_COMMIT:
				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
					dprintk("NFS: %5u write verify failed\n", task->tk_pid);
					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
				}
		}
	}
	/* In case we have to resend */
	data->args.stable = NFS_FILE_SYNC;


	spin_unlock(&dreq->lock);
}

/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
static void nfs_direct_write_release(void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	spin_lock(&dreq->lock);
	if (--dreq->outstanding) {
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		spin_unlock(&dreq->lock);
		return;
		return;
	}
	}

	spin_unlock(&dreq->lock);
	spin_unlock(&dreq->lock);


	nfs_end_data_update(data->inode);
	nfs_direct_write_complete(dreq, data->inode);
	nfs_direct_complete(dreq);
}
}


static const struct rpc_call_ops nfs_write_direct_ops = {
static const struct rpc_call_ops nfs_write_direct_ops = {
	.rpc_call_done = nfs_direct_write_result,
	.rpc_call_done = nfs_direct_write_result,
	.rpc_release = nfs_writedata_release,
	.rpc_release = nfs_direct_write_release,
};
};


/*
/*
 * For each nfs_write_data struct that was allocated on the list, dispatch
 * For each nfs_write_data struct that was allocated on the list, dispatch
 * an NFS WRITE operation
 * an NFS WRITE operation
 *
 * XXX: For now, support only FILE_SYNC writes.  Later we may add
 *      support for UNSTABLE + COMMIT.
 */
 */
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
{
{
	struct file *file = dreq->filp;
	struct file *file = dreq->filp;
	struct inode *inode = file->f_mapping->host;
	struct inode *inode = file->f_mapping->host;
@@ -482,11 +647,13 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
							file->private_data;
							file->private_data;
	struct list_head *list = &dreq->list;
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
	size_t wsize = NFS_SERVER(inode)->wsize;
	size_t wsize = NFS_SERVER(inode)->wsize;
	unsigned int curpage, pgbase;
	unsigned int curpage, pgbase;


	curpage = 0;
	curpage = 0;
	pgbase = user_addr & ~PAGE_MASK;
	pgbase = dreq->user_addr & ~PAGE_MASK;
	do {
	do {
		struct nfs_write_data *data;
		struct nfs_write_data *data;
		size_t bytes;
		size_t bytes;
@@ -496,7 +663,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
			bytes = count;
			bytes = count;


		data = list_entry(list->next, struct nfs_write_data, pages);
		data = list_entry(list->next, struct nfs_write_data, pages);
		list_del_init(&data->pages);
		list_move_tail(&data->pages, &dreq->rewrite_list);


		data->inode = inode;
		data->inode = inode;
		data->cred = ctx->cred;
		data->cred = ctx->cred;
@@ -512,7 +679,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long


		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_write_direct_ops, data);
				&nfs_write_direct_ops, data);
		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
		NFS_PROTO(inode)->write_setup(data, sync);


		data->task.tk_priority = RPC_PRIORITY_NORMAL;
		data->task.tk_priority = RPC_PRIORITY_NORMAL;
		data->task.tk_cookie = (unsigned long) inode;
		data->task.tk_cookie = (unsigned long) inode;
@@ -544,11 +711,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;
	struct nfs_direct_req *dreq;
	size_t wsize = NFS_SERVER(inode)->wsize;
	int sync = 0;


	dreq = nfs_direct_write_alloc(count, NFS_SERVER(inode)->wsize);
	dreq = nfs_direct_write_alloc(count, wsize);
	if (!dreq)
	if (!dreq)
		return -ENOMEM;
		return -ENOMEM;
	if (dreq->commit_data == NULL || count < wsize)
		sync = FLUSH_STABLE;


	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->npages = nr_pages;
	igrab(inode);
	igrab(inode);
@@ -562,7 +736,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
	nfs_begin_data_update(inode);
	nfs_begin_data_update(inode);


	rpc_clnt_sigmask(clnt, &oldset);
	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_write_schedule(dreq, user_addr, count, pos);
	nfs_direct_write_schedule(dreq, sync);
	result = nfs_direct_wait(dreq);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);
	rpc_clnt_sigunmask(clnt, &oldset);


+1 −0
Original line number Original line Diff line number Diff line
@@ -422,6 +422,7 @@ void nfs_commit_free(struct nfs_write_data *p);
extern int  nfs_sync_inode(struct inode *, unsigned long, unsigned int, int);
extern int  nfs_sync_inode(struct inode *, unsigned long, unsigned int, int);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
extern int  nfs_commit_inode(struct inode *, int);
extern int  nfs_commit_inode(struct inode *, int);
extern void nfs_commit_release(void *wdata);
#else
#else
static inline int
static inline int
nfs_commit_inode(struct inode *inode, int how)
nfs_commit_inode(struct inode *inode, int how)