Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ed42fe05 authored by Al Viro's avatar Al Viro Committed by Mike Marshall
Browse files

orangefs: hopefully saner op refcounting and locking



* create with refcount 1
* make op_release() decrement and free if zero (i.e. old put_op()
  has become that).
* mark when submitter has given up waiting; from that point nobody
  else can move between the lists, change state, etc.
* have daemon read/write_iter grab a reference when picking op
  and *always* give it up in the end
* don't put into hash until we know it's been successfully passed to
  daemon

* move op->lock _lower_ than htab_in_progress_lock (and make sure
  to take it in purge_inprogress_ops())

Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
Signed-off-by: default avatarMike Marshall <hubcap@omnibond.com>
parent fee25ce1
Loading
Loading
Loading
Loading
+60 −52
Original line number Diff line number Diff line
@@ -43,9 +43,7 @@ static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
{
	int index = hash_func(op->tag, hash_table_size);

	spin_lock(&htable_ops_in_progress_lock);
	list_add_tail(&op->list, &htable_ops_in_progress[index]);
	spin_unlock(&htable_ops_in_progress_lock);
}

static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
@@ -60,8 +58,9 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
				 next,
				 &htable_ops_in_progress[index],
				 list) {
		if (op->tag == tag) {
			list_del(&op->list);
		if (op->tag == tag && !op_state_purged(op)) {
			list_del_init(&op->list);
			get_op(op); /* increase ref count. */
			spin_unlock(&htable_ops_in_progress_lock);
			return op;
		}
@@ -127,12 +126,17 @@ static ssize_t orangefs_devreq_read(struct file *file,
		return -EINVAL;
	}

restart:
	/* Get next op (if any) from top of list. */
	spin_lock(&orangefs_request_list_lock);
	list_for_each_entry_safe(op, temp, &orangefs_request_list, list) {
		__s32 fsid;
		/* This lock is held past the end of the loop when we break. */
		spin_lock(&op->lock);
		if (unlikely(op_state_purged(op))) {
			spin_unlock(&op->lock);
			continue;
		}

		fsid = fsid_of_op(op);
		if (fsid != ORANGEFS_FS_ID_NULL) {
@@ -197,16 +201,10 @@ static ssize_t orangefs_devreq_read(struct file *file,
		spin_unlock(&orangefs_request_list_lock);
		return -EAGAIN;
	}

	/*
	 * Set the operation to be in progress and move it between lists since
	 * it has been sent to the client.
	 */
	set_op_state_inprogress(cur_op);

	list_del(&cur_op->list);
	list_del_init(&cur_op->list);
	get_op(op);
	spin_unlock(&orangefs_request_list_lock);
	orangefs_devreq_add_op(cur_op);

	spin_unlock(&cur_op->lock);

	/* Push the upcall out. */
@@ -224,6 +222,25 @@ static ssize_t orangefs_devreq_read(struct file *file,
	if (ret != 0)
		goto error;

	spin_lock(&htable_ops_in_progress_lock);
	spin_lock(&cur_op->lock);
	if (unlikely(op_state_given_up(cur_op))) {
		spin_unlock(&cur_op->lock);
		spin_unlock(&htable_ops_in_progress_lock);
		op_release(cur_op);
		goto restart;
	}

	/*
	 * Set the operation to be in progress and move it between lists since
	 * it has been sent to the client.
	 */
	set_op_state_inprogress(cur_op);
	orangefs_devreq_add_op(cur_op);
	spin_unlock(&cur_op->lock);
	spin_unlock(&htable_ops_in_progress_lock);
	op_release(cur_op);

	/* The client only asks to read one size buffer. */
	return MAX_DEV_REQ_UPSIZE;
error:
@@ -235,11 +252,13 @@ static ssize_t orangefs_devreq_read(struct file *file,
	gossip_err("orangefs: Failed to copy data to user space\n");
	spin_lock(&orangefs_request_list_lock);
	spin_lock(&cur_op->lock);
	if (likely(!op_state_given_up(cur_op))) {
		set_op_state_waiting(cur_op);
	orangefs_devreq_remove_op(cur_op->tag);
		list_add(&cur_op->list, &orangefs_request_list);
	}
	spin_unlock(&cur_op->lock);
	spin_unlock(&orangefs_request_list_lock);
	op_release(cur_op);
	return -EFAULT;
}

@@ -278,15 +297,13 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
			   __func__,
			   total,
			   (unsigned int) MAX_DEV_REQ_DOWNSIZE);
		ret = -EFAULT;
		goto out;
		return -EFAULT;
	}
     
	n = copy_from_iter(&head, head_size, iter);
	if (n < head_size) {
		gossip_err("%s: failed to copy head.\n", __func__);
		ret = -EFAULT;
		goto out;
		return -EFAULT;
	}

	if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) {
@@ -295,31 +312,26 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
			   __func__,
			   head.version,
			   ORANGEFS_MINIMUM_USERSPACE_VERSION);
		ret = -EPROTO;
		goto out;
		return -EPROTO;
	}

	if (head.magic != ORANGEFS_DEVREQ_MAGIC) {
		gossip_err("Error: Device magic number does not match.\n");
		ret = -EPROTO;
		goto out;
		return -EPROTO;
	}

	op = orangefs_devreq_remove_op(head.tag);
	if (!op) {
		gossip_err("WARNING: No one's waiting for tag %llu\n",
			   llu(head.tag));
		goto out;
		return ret;
	}

	get_op(op); /* increase ref count. */

	n = copy_from_iter(&op->downcall, downcall_size, iter);
	if (n != downcall_size) {
		gossip_err("%s: failed to copy downcall.\n", __func__);
		put_op(op);
		ret = -EFAULT;
		goto out;
		goto Broken;
	}

	if (op->downcall.status)
@@ -339,9 +351,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
			   downcall_size,
			   op->downcall.trailer_size,
			   total);
		put_op(op);
		ret = -EFAULT;
		goto out;
		goto Broken;
	}

	/* Only READDIR operations should have trailers. */
@@ -350,9 +361,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
		gossip_err("%s: %x operation with trailer.",
			   __func__,
			   op->downcall.type);
		put_op(op);
		ret = -EFAULT;
		goto out;
		goto Broken;
	}

	/* READDIR operations should always have trailers. */
@@ -361,9 +371,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
		gossip_err("%s: %x operation with no trailer.",
			   __func__,
			   op->downcall.type);
		put_op(op);
		ret = -EFAULT;
		goto out;
		goto Broken;
	}

	if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
@@ -374,9 +383,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
	if (op->downcall.trailer_buf == NULL) {
		gossip_err("%s: failed trailer vmalloc.\n",
			   __func__);
		put_op(op);
		ret = -ENOMEM;
		goto out;
		goto Broken;
	}
	memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
	n = copy_from_iter(op->downcall.trailer_buf,
@@ -385,9 +393,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
	if (n != op->downcall.trailer_size) {
		gossip_err("%s: failed to copy trailer.\n", __func__);
		vfree(op->downcall.trailer_buf);
		put_op(op);
		ret = -EFAULT;
		goto out;
		goto Broken;
	}

wakeup:
@@ -404,7 +411,6 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
	 * the buffers are done being used.
	 */
	if (op->downcall.type == ORANGEFS_VFS_OP_FILE_IO) {
		int timed_out = 0;
		DEFINE_WAIT(wait_entry);

		/*
@@ -412,6 +418,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
		 * that this op is done
		 */
		spin_lock(&op->lock);
		if (!op_state_given_up(op))
			set_op_state_serviced(op);
		spin_unlock(&op->lock);

@@ -435,7 +442,6 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
					gossip_debug(GOSSIP_DEV_DEBUG,
						"%s: timed out.\n",
						__func__);
					timed_out = 1;
					break;
				}
				continue;
@@ -450,15 +456,6 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
		spin_lock(&op->lock);
		finish_wait(&op->io_completion_waitq, &wait_entry);
		spin_unlock(&op->lock);

		/* NOTE: for I/O operations we handle releasing the op
		 * object except in the case of timeout.  the reason we
		 * can't free the op in timeout cases is that the op
		 * service logic in the vfs retries operations using
		 * the same op ptr, thus it can't be freed.
		 */
		if (!timed_out)
			op_release(op);
	} else {
		/*
		 * tell the vfs op waiting on a waitqueue that
@@ -468,11 +465,22 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
		 * notification
		 */
		spin_lock(&op->lock);
		if (!op_state_given_up(op))
			set_op_state_serviced(op);
		spin_unlock(&op->lock);
	}
out:
	op_release(op);
	return ret;

Broken:
	spin_lock(&op->lock);
	if (!op_state_given_up(op)) {
		op->downcall.status = ret;
		set_op_state_serviced(op);
	}
	spin_unlock(&op->lock);
	goto out;
}

/* Returns whether any FS are still pending remounted */
+5 −12
Original line number Diff line number Diff line
@@ -105,10 +105,9 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod
	ssize_t ret;

	new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
	if (!new_op) {
		ret = -ENOMEM;
		goto out;
	}
	if (!new_op)
		return -ENOMEM;

	/* synchronous I/O */
	new_op->upcall.req.io.async_vfs_io = ORANGEFS_VFS_SYNC_IO;
	new_op->upcall.req.io.readahead_size = readahead_size;
@@ -234,12 +233,9 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod

	/*
	 * tell the device file owner waiting on I/O that this read has
	 * completed and it can return now.  in this exact case, on
	 * wakeup the daemon will free the op, so we *cannot* touch it
	 * after this.
	 * completed and it can return now.
	 */
	wake_up_daemon_for_return(new_op);
	new_op = NULL;

out:
	if (buffer_index >= 0) {
@@ -249,10 +245,7 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod
			     __func__, handle, buffer_index);
		buffer_index = -1;
	}
	if (new_op) {
	op_release(new_op);
		new_op = NULL;
	}
	return ret;
}

+2 −3
Original line number Diff line number Diff line
@@ -120,7 +120,7 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type)
		init_waitqueue_head(&new_op->waitq);

		init_waitqueue_head(&new_op->io_completion_waitq);
		atomic_set(&new_op->ref_count, 0);
		atomic_set(&new_op->ref_count, 1);

		orangefs_op_initialize(new_op);

@@ -149,14 +149,13 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type)
	return new_op;
}

void op_release(struct orangefs_kernel_op_s *orangefs_op)
void __op_release(struct orangefs_kernel_op_s *orangefs_op)
{
	if (orangefs_op) {
		gossip_debug(GOSSIP_CACHE_DEBUG,
			     "Releasing OP (%p: %llu)\n",
			     orangefs_op,
			     llu(orangefs_op->tag));
		orangefs_op_initialize(orangefs_op);
		kmem_cache_free(op_cache, orangefs_op);
	} else {
		gossip_err("NULL pointer in op_release\n");
+21 −25
Original line number Diff line number Diff line
@@ -94,6 +94,7 @@ sizeof(__u64) + sizeof(struct orangefs_downcall_s))
 * serviced - op has matching downcall; ok
 * purged   - op has to start a timer since client-core
 *            exited uncleanly before servicing op
 * given up - submitter has given up waiting for it
 */
enum orangefs_vfs_op_states {
	OP_VFS_STATE_UNKNOWN = 0,
@@ -101,30 +102,9 @@ enum orangefs_vfs_op_states {
	OP_VFS_STATE_INPROGR = 2,
	OP_VFS_STATE_SERVICED = 4,
	OP_VFS_STATE_PURGED = 8,
	OP_VFS_STATE_GIVEN_UP = 16,
};

#define get_op(op)					\
	do {						\
		atomic_inc(&(op)->ref_count);	\
		gossip_debug(GOSSIP_DEV_DEBUG,	\
			"(get) Alloced OP (%p:%llu)\n",	\
			op,				\
			llu((op)->tag));		\
	} while (0)

#define put_op(op)							\
	do {								\
		if (atomic_sub_and_test(1, &(op)->ref_count) == 1) {  \
			gossip_debug(GOSSIP_DEV_DEBUG,		\
				"(put) Releasing OP (%p:%llu)\n",	\
				op,					\
				llu((op)->tag));			\
			op_release(op);					\
			}						\
	} while (0)

#define op_wait(op) (atomic_read(&(op)->ref_count) <= 2 ? 0 : 1)

/*
 * Defines for controlling whether I/O upcalls are for async or sync operations
 */
@@ -258,6 +238,25 @@ static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
#define op_state_in_progress(op) ((op)->op_state & OP_VFS_STATE_INPROGR)
#define op_state_serviced(op)    ((op)->op_state & OP_VFS_STATE_SERVICED)
#define op_state_purged(op)      ((op)->op_state & OP_VFS_STATE_PURGED)
#define op_state_given_up(op)    ((op)->op_state & OP_VFS_STATE_GIVEN_UP)

static inline void get_op(struct orangefs_kernel_op_s *op)
{
	atomic_inc(&op->ref_count);
	gossip_debug(GOSSIP_DEV_DEBUG,
			"(get) Alloced OP (%p:%llu)\n",	op, llu(op->tag));
}

void __op_release(struct orangefs_kernel_op_s *op);

static inline void op_release(struct orangefs_kernel_op_s *op)
{
	if (atomic_dec_and_test(&op->ref_count)) {
		gossip_debug(GOSSIP_DEV_DEBUG,
			"(put) Releasing OP (%p:%llu)\n", op, llu((op)->tag));
		__op_release(op);
	}
}

/* per inode private orangefs info */
struct orangefs_inode_s {
@@ -459,7 +458,6 @@ int op_cache_initialize(void);
int op_cache_finalize(void);
struct orangefs_kernel_op_s *op_alloc(__s32 type);
char *get_opname_string(struct orangefs_kernel_op_s *new_op);
void op_release(struct orangefs_kernel_op_s *op);

int dev_req_cache_initialize(void);
int dev_req_cache_finalize(void);
@@ -665,11 +663,9 @@ int service_operation(struct orangefs_kernel_op_s *op,
do {								\
	if (!op_state_serviced(new_op)) {			\
		orangefs_cancel_op_in_progress(new_op->tag);	\
		op_release(new_op);				\
	} else {						\
		wake_up_daemon_for_return(new_op);		\
	}							\
	new_op = NULL;						\
	orangefs_bufmap_put(bufmap, buffer_index);				\
	buffer_index = -1;					\
} while (0)
+2 −0
Original line number Diff line number Diff line
@@ -271,6 +271,7 @@ void purge_inprogress_ops(void)
		struct orangefs_kernel_op_s *op;
		struct orangefs_kernel_op_s *next;

		spin_lock(&htable_ops_in_progress_lock);
		list_for_each_entry_safe(op,
					 next,
					 &htable_ops_in_progress[i],
@@ -284,6 +285,7 @@ void purge_inprogress_ops(void)
			set_op_state_purged(op);
			spin_unlock(&op->lock);
		}
		spin_unlock(&htable_ops_in_progress_lock);
	}
}

Loading