Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 26c5eaa1 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'ceph-for-4.12-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The two main items are support for disabling automatic rbd exclusive
  lock transfers from myself and the long awaited -ENOSPC handling
  series from Jeff.

  The former will allow rbd users to take advantage of exclusive lock's
  built-in blacklist/break-lock functionality while staying in control
  of who owns the lock. With the latter in place, we will abort
  filesystem writes on -ENOSPC instead of having them block
  indefinitely.

  Beyond that we've got the usual pile of filesystem fixes from Zheng,
  some refcount_t conversion patches from Elena and a patch for an
  ancient open() flags handling bug from Alexander"

* tag 'ceph-for-4.12-rc1' of git://github.com/ceph/ceph-client: (31 commits)
  ceph: fix memory leak in __ceph_setxattr()
  ceph: fix file open flags on ppc64
  ceph: choose readdir frag based on previous readdir reply
  rbd: exclusive map option
  rbd: return ResponseMessage result from rbd_handle_request_lock()
  rbd: kill rbd_is_lock_supported()
  rbd: support updating the lock cookie without releasing the lock
  rbd: store lock cookie
  rbd: ignore unlock errors
  rbd: fix error handling around rbd_init_disk()
  rbd: move rbd_unregister_watch() call into rbd_dev_image_release()
  rbd: move rbd_dev_destroy() call out of rbd_dev_image_release()
  ceph: when seeing write errors on an inode, switch to sync writes
  Revert "ceph: SetPageError() for writeback pages if writepages fails"
  ceph: handle epoch barriers in cap messages
  libceph: add an epoch_barrier field to struct ceph_osd_client
  libceph: abort already submitted but abortable requests when map or pool goes full
  libceph: allow requests to return immediately on full conditions if caller wishes
  libceph: remove req->r_replay_version
  ceph: make seeky readdir more efficient
  ...
parents 1176032c eeca958d
Loading
Loading
Loading
Loading
+215 −144
Original line number Diff line number Diff line
@@ -387,6 +387,7 @@ struct rbd_device {

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
@@ -477,13 +478,6 @@ static int minor_to_rbd_dev_id(int minor)
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}

static bool rbd_is_lock_supported(struct rbd_device *rbd_dev)
{
	return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	       rbd_dev->spec->snap_id == CEPH_NOSNAP &&
	       !rbd_dev->mapping.read_only;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
@@ -731,7 +725,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
@@ -804,6 +798,7 @@ enum {
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_err
};

@@ -816,6 +811,7 @@ static match_table_t rbd_opts_tokens = {
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_err, NULL}
};

@@ -823,11 +819,13 @@ struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
@@ -866,6 +864,9 @@ static int parse_rbd_opts_token(char *c, void *private)
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
@@ -3079,7 +3080,8 @@ static int rbd_lock(struct rbd_device *rbd_dev)
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev));
	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
@@ -3089,6 +3091,7 @@ static int rbd_lock(struct rbd_device *rbd_dev)
		return ret;

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
	return 0;
@@ -3097,27 +3100,24 @@ static int rbd_lock(struct rbd_device *rbd_dev)
/*
 * lock_rwsem must be held for write
 */
static int rbd_unlock(struct rbd_device *rbd_dev)
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev));

	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, cookie);
	if (ret && ret != -ENOENT) {
		rbd_warn(rbd_dev, "cls_unlock failed: %d", ret);
		return ret;
	}
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
	return 0;
}

static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
@@ -3447,6 +3447,18 @@ static void rbd_acquire_lock(struct work_struct *work)
	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		/*
		 * If this is rbd_add_acquire_lock(), we want to fail
		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
		 * want to block.
		 */
		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			/* wake "rbd map --exclusive" process */
			wake_requests(rbd_dev, false);
		}
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
@@ -3490,7 +3502,7 @@ static bool rbd_release_lock(struct rbd_device *rbd_dev)
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	if (!rbd_unlock(rbd_dev))
	rbd_unlock(rbd_dev);
	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO during ceph_osdc_sync()
@@ -3499,7 +3511,6 @@ static bool rbd_release_lock(struct rbd_device *rbd_dev)
	 * after wake_requests() in rbd_handle_released_lock().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);

	return true;
}

@@ -3580,12 +3591,16 @@ static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
	up_read(&rbd_dev->lock_rwsem);
}

static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	bool need_to_send;
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
@@ -3595,19 +3610,36 @@ static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return false;
		return result;

	down_read(&rbd_dev->lock_rwsem);
	need_to_send = __rbd_is_lock_owner(rbd_dev);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
		if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) {
			dout("%s rbd_dev %p queueing unlock_work\n", __func__,
			     rbd_dev);
			queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work);
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return need_to_send;
	return result;
}

static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
@@ -3690,13 +3722,10 @@ static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		if (rbd_handle_request_lock(rbd_dev, struct_v, &p))
			/*
			 * send ResponseMessage(0) back so the client
			 * can detect a missing owner
			 */
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, 0);
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
@@ -3821,24 +3850,51 @@ static void rbd_unregister_watch(struct rbd_device *rbd_dev)
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		strcpy(rbd_dev->lock_cookie, cookie);
	}
}

static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	bool was_lock_owner = false;
	bool need_to_wake = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		was_lock_owner = rbd_release_lock(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		goto out;
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
@@ -3846,36 +3902,28 @@ static void rbd_reregister_watch(struct work_struct *work)
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			need_to_wake = true;
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		goto out;
		return;
	}

	need_to_wake = true;
	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);

	if (was_lock_owner) {
		ret = rbd_try_lock(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "reregisteration lock failed: %d",
				 ret);
	}

out:
	up_write(&rbd_dev->lock_rwsem);
	if (need_to_wake)
		wake_requests(rbd_dev, true);
}

/*
@@ -4034,10 +4082,6 @@ static void rbd_queue_workfn(struct work_struct *work)
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
		must_be_locked = rbd_is_lock_supported(rbd_dev);
	} else {
		must_be_locked = rbd_dev->opts->lock_on_read &&
					rbd_is_lock_supported(rbd_dev);
	}
	up_read(&rbd_dev->header_rwsem);

@@ -4048,14 +4092,20 @@ static void rbd_queue_workfn(struct work_struct *work)
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
		    !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
		    !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			if (rbd_dev->opts->exclusive) {
				rbd_warn(rbd_dev, "exclusive lock required");
				result = -EROFS;
				goto err_unlock;
			}
			rbd_wait_state_locked(rbd_dev);

		WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^
			!test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
		}
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			result = -EBLACKLISTED;
			goto err_unlock;
@@ -4114,19 +4164,10 @@ static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_dev->disk = NULL;
	if (disk->flags & GENHD_FL_UP) {
		del_gendisk(disk);
		if (disk->queue)
			blk_cleanup_queue(disk->queue);
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	}
	put_disk(disk);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
@@ -4383,8 +4424,12 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;

	/*
	 * disk_release() expects a queue ref from add_disk() and will
	 * put it.  Hold an extra ref until add_disk() is called.
	 */
	WARN_ON(!blk_get_queue(q));
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
@@ -5624,6 +5669,7 @@ static int rbd_add_parse_args(const char *buf,
	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
@@ -5682,6 +5728,33 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
	return ret;
}

static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		rbd_unlock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
		return -EINVAL;
	}

	/* FIXME: "rbd map --exclusive" should be in interruptible */
	down_read(&rbd_dev->lock_rwsem);
	rbd_wait_state_locked(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
		return -EROFS;
	}

	return 0;
}

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
@@ -5873,6 +5946,15 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
	return ret;
}

static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_mapping_clear(rbd_dev);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
@@ -5908,26 +5990,13 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);

	dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	ret = device_add(&rbd_dev->dev);
	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_mapping;

	/* Everything's ready.  Announce the disk to the world. */

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	add_disk(rbd_dev->disk);
	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);

	return ret;
	return 0;

err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
@@ -5962,11 +6031,11 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	if (rbd_dev->opts)
		rbd_unregister_watch(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	rbd_dev_destroy(rbd_dev);
}

/*
@@ -6126,22 +6195,43 @@ static ssize_t do_rbd_add(struct bus_type *bus,
	rbd_dev->mapping.read_only = read_only;

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc) {
		/*
		 * rbd_unregister_watch() can't be moved into
		 * rbd_dev_image_release() without refactoring, see
		 * commit 1f3ef78861ac.
		 */
		rbd_unregister_watch(rbd_dev);
		rbd_dev_image_release(rbd_dev);
		goto out;
	if (rc)
		goto err_out_image_probe;

	if (rbd_dev->opts->exclusive) {
		rc = rbd_add_acquire_lock(rbd_dev);
		if (rc)
			goto err_out_device_setup;
	}

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
err_out_device_setup:
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
@@ -6169,21 +6259,6 @@ static ssize_t rbd_add_single_major(struct bus_type *bus,
	return do_rbd_add(bus, buf, count);
}

static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	rbd_free_disk(rbd_dev);

	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);

	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	device_del(&rbd_dev->dev);
	rbd_dev_mapping_clear(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
@@ -6201,6 +6276,7 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

@@ -6269,21 +6345,16 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		rbd_unlock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
	rbd_unregister_watch(rbd_dev);
	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	/*
	 * Don't free anything from rbd_dev->disk until after all
	 * notifies are completely processed. Otherwise
	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
	 * in a potential use after free of rbd_dev->disk or rbd_dev.
	 */
	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);

	rbd_dev_destroy(rbd_dev);
	return count;
}

+6 −4
Original line number Diff line number Diff line
@@ -670,8 +670,12 @@ static void writepages_finish(struct ceph_osd_request *req)
	bool remove_page;

	dout("writepages_finish %p rc %d\n", inode, rc);
	if (rc < 0)
	if (rc < 0) {
		mapping_set_error(mapping, rc);
		ceph_set_error_write(ci);
	} else {
		ceph_clear_error_write(ci);
	}

	/*
	 * We lost the cache cap, need to truncate the page before
@@ -703,9 +707,6 @@ static void writepages_finish(struct ceph_osd_request *req)
				clear_bdi_congested(inode_to_bdi(inode),
						    BLK_RW_ASYNC);

			if (rc < 0)
				SetPageError(page);

			ceph_put_snap_context(page_snap_context(page));
			page->private = 0;
			ClearPagePrivate(page);
@@ -1892,6 +1893,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);

	wr_req->r_mtime = ci->vfs_inode.i_mtime;
	wr_req->r_abort_on_full = true;
	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);

	if (!err)
+18 −7
Original line number Diff line number Diff line
@@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
	void *p;
	size_t extra_len;
	struct timespec zerotime = {0};
	struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;

	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
	     " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
@@ -1076,8 +1077,12 @@ static int send_cap_msg(struct cap_msg_args *arg)
	ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
	/* inline data size */
	ceph_encode_32(&p, 0);
	/* osd_epoch_barrier (version 5) */
	ceph_encode_32(&p, 0);
	/*
	 * osd_epoch_barrier (version 5)
	 * The epoch_barrier is protected osdc->lock, so READ_ONCE here in
	 * case it was recently changed
	 */
	ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
	/* oldest_flush_tid (version 6) */
	ceph_encode_64(&p, arg->oldest_flush_tid);

@@ -1389,7 +1394,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
		first_tid = cf->tid + 1;

		capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
		atomic_inc(&capsnap->nref);
		refcount_inc(&capsnap->nref);
		spin_unlock(&ci->i_ceph_lock);

		dout("__flush_snaps %p capsnap %p tid %llu %s\n",
@@ -2202,7 +2207,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
			     inode, capsnap, cf->tid,
			     ceph_cap_string(capsnap->dirty));

			atomic_inc(&capsnap->nref);
			refcount_inc(&capsnap->nref);
			spin_unlock(&ci->i_ceph_lock);

			ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
@@ -3633,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session,
		p += inline_len;
	}

	if (le16_to_cpu(msg->hdr.version) >= 5) {
		struct ceph_osd_client	*osdc = &mdsc->fsc->client->osdc;
		u32			epoch_barrier;

		ceph_decode_32_safe(&p, end, epoch_barrier, bad);
		ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
	}

	if (le16_to_cpu(msg->hdr.version) >= 8) {
		u64 flush_tid;
		u32 caller_uid, caller_gid;
		u32 osd_epoch_barrier;
		u32 pool_ns_len;
		/* version >= 5 */
		ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);

		/* version >= 6 */
		ceph_decode_64_safe(&p, end, flush_tid, bad);
		/* version >= 7 */
+10 −11
Original line number Diff line number Diff line
@@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p)
{
	int i;
	struct ceph_fs_client *fsc = s->private;
	struct ceph_mdsmap *mdsmap;

	if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
		return 0;
	seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
	seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
	seq_printf(s, "session_timeout %d\n",
		       fsc->mdsc->mdsmap->m_session_timeout);
	seq_printf(s, "session_autoclose %d\n",
		       fsc->mdsc->mdsmap->m_session_autoclose);
	for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
		struct ceph_entity_addr *addr =
			&fsc->mdsc->mdsmap->m_info[i].addr;
		int state = fsc->mdsc->mdsmap->m_info[i].state;

	mdsmap = fsc->mdsc->mdsmap;
	seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
	seq_printf(s, "root %d\n", mdsmap->m_root);
	seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
	seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
	seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
	for (i = 0; i < mdsmap->m_num_mds; i++) {
		struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
		int state = mdsmap->m_info[i].state;
		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
			       ceph_pr_addr(&addr->in_addr),
			       ceph_mds_state_name(state));
+16 −7
Original line number Diff line number Diff line
@@ -294,7 +294,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
	struct ceph_mds_client *mdsc = fsc->mdsc;
	int i;
	int err;
	u32 ftype;
	unsigned frag = -1;
	struct ceph_mds_reply_info_parsed *rinfo;

	dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
@@ -341,7 +341,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
	/* do we have the correct frag content buffered? */
	if (need_send_readdir(fi, ctx->pos)) {
		struct ceph_mds_request *req;
		unsigned frag;
		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

@@ -352,6 +351,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
		}

		if (is_hash_order(ctx->pos)) {
			/* fragtree isn't always accurate. choose frag
			 * based on previous reply when possible. */
			if (frag == (unsigned)-1)
				frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
							NULL, NULL);
		} else {
@@ -378,7 +380,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
				ceph_mdsc_put_request(req);
				return -ENOMEM;
			}
		} else if (is_hash_order(ctx->pos)) {
			req->r_args.readdir.offset_hash =
				cpu_to_le32(fpos_hash(ctx->pos));
		}

		req->r_dir_release_cnt = fi->dir_release_count;
		req->r_dir_ordered_cnt = fi->dir_ordered_count;
		req->r_readdir_cache_idx = fi->readdir_cache_idx;
@@ -476,6 +482,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
		struct ceph_vino vino;
		ino_t ino;
		u32 ftype;

		BUG_ON(rde->offset < ctx->pos);

@@ -498,15 +505,17 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
		ctx->pos++;
	}

	if (fi->next_offset > 2) {
	ceph_mdsc_put_request(fi->last_readdir);
	fi->last_readdir = NULL;

	if (fi->next_offset > 2) {
		frag = fi->frag;
		goto more;
	}

	/* more frags? */
	if (!ceph_frag_is_rightmost(fi->frag)) {
		unsigned frag = ceph_frag_next(fi->frag);
		frag = ceph_frag_next(fi->frag);
		if (is_hash_order(ctx->pos)) {
			loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
							fi->next_offset, true);
Loading