Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 922dab61 authored by Ilya Dryomov
Browse files

libceph, rbd: ceph_osd_linger_request, watch/notify v2



This adds support for, and switches rbd to, a new, more reliable version
of the watch/notify protocol.  As with the OSD client update, this is
mostly about getting the right structures linked into the right places
so that reconnects are properly sent when needed.  watch/notify v2 also
requires sending regular pings to the OSDs - send_linger_ping().

A major change from the old watch/notify implementation is the
introduction of ceph_osd_linger_request - linger requests no longer
piggyback on ceph_osd_request.  ceph_osd_event has been merged into
ceph_osd_linger_request.

All the details are now hidden within libceph, the interface consists
of a simple pair of watch/unwatch functions and ceph_osdc_notify_ack().
ceph_osdc_watch() does return ceph_osd_linger_request, but only to keep
the lifetime management simple.

ceph_osdc_notify_ack() accepts an optional data payload, which is
relayed back to the notifier.

Portions of this patch are loosely based on work by Douglas Fuller
<dfuller@redhat.com> and Mike Christie <michaelc@cs.wisc.edu>.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
parent c525f036
Loading
Loading
Loading
Loading
+41 −138
Original line number Diff line number Diff line
@@ -351,11 +351,11 @@ struct rbd_device {
	struct rbd_options	*opts;

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;
	struct ceph_osd_linger_request *watch_handle;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
	return __rbd_obj_request_wait(obj_request, 0);
}

/*
 * Wait for an object request, giving up after a caller-chosen timeout.
 *
 * Thin wrapper: forwards @obj_request and @timeout unchanged to
 * __rbd_obj_request_wait() and returns its result.  The untimed
 * variant, rbd_obj_request_wait(), calls the same helper with 0.
 *
 * NOTE(review): the unit of @timeout (presumably jiffies) is not
 * visible here -- confirm against __rbd_obj_request_wait().
 */
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
					unsigned long timeout)
{
	return __rbd_obj_request_wait(obj_request, timeout);
}

static void rbd_img_request_complete(struct rbd_img_request *img_request)
{

@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
		complete_all(&obj_request->completion);
}

/*
 * Completion handler for osd ops that need no result processing:
 * emit a debug line for the request and mark it done.
 * rbd_osd_req_callback() dispatches here for CEPH_OSD_OP_NOTIFY_ACK
 * and CEPH_OSD_OP_WATCH.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
@@ -1877,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
	case CEPH_OSD_OP_CALL:
		rbd_osd_call_callback(obj_request);
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu",
			obj_request->object_name, (unsigned short) opcode);
@@ -3100,45 +3084,18 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
	obj_request_done_set(obj_request);
}

/*
 * Synchronously acknowledge a notify on the image header object.
 *
 * Builds a single-op, data-less object request against the header
 * object, fills it with a CEPH_OSD_OP_NOTIFY_ACK for @notify_id,
 * submits it and blocks until it completes.
 *
 * Returns -ENOMEM if either the object request or its osd request
 * cannot be allocated; otherwise the value returned by
 * rbd_obj_request_submit() or, if submission succeeded, by
 * rbd_obj_request_wait().  The reference taken at creation is always
 * dropped before returning.
 */
static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *ack_req;
	int err = -ENOMEM;

	ack_req = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0,
					 OBJ_REQUEST_NODATA);
	if (!ack_req)
		return err;

	ack_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
					      ack_req);
	if (ack_req->osd_req) {
		osd_req_op_watch_init(ack_req->osd_req, 0,
				      CEPH_OSD_OP_NOTIFY_ACK, notify_id, 0, 0);
		rbd_osd_req_format_read(ack_req);

		/* only wait if the request actually went out */
		err = rbd_obj_request_submit(osdc, ack_req);
		if (!err)
			err = rbd_obj_request_wait(ack_req);
	}

	rbd_obj_request_put(ack_req);

	return err;
}
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);

static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	struct rbd_device *rbd_dev = arg;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
		rbd_dev->header_oid.name, (unsigned long long)notify_id,
		(unsigned int)opcode);
	dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
	     cookie, notify_id);

	/*
	 * Until adequate refresh error handling is in place, there is
@@ -3150,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
	if (ret)
		rbd_warn(rbd_dev, "refresh failed: %d", ret);

	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   NULL, 0);
	if (ret)
		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
}

/*
 * Send a (un)watch request and wait for the ack.  Return a request
 * with a ref held on success or error.
 */
static struct rbd_obj_request *rbd_obj_watch_request_helper(
						struct rbd_device *rbd_dev,
						bool watch)
static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_options *opts = osdc->client->options;
	struct rbd_obj_request *obj_request;
	struct rbd_device *rbd_dev = arg;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0,
					     OBJ_REQUEST_NODATA);
	if (!obj_request)
		return ERR_PTR(-ENOMEM);
	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
						  obj_request);
	if (!obj_request->osd_req) {
		ret = -ENOMEM;
		goto out;
	}

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
			      rbd_dev->watch_event->cookie, 0, watch);
	rbd_osd_req_format_write(obj_request);

	if (watch)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;

	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
	if (ret)
		goto out;
	__rbd_dev_header_unwatch_sync(rbd_dev);

	ret = obj_request->result;
	ret = rbd_dev_header_watch_sync(rbd_dev);
	if (ret) {
		if (watch)
			rbd_obj_request_end(obj_request);
		goto out;
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		return;
	}

	return obj_request;

out:
	rbd_obj_request_put(obj_request);
	return ERR_PTR(ret);
	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
}

/*
@@ -3215,57 +3140,33 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	int ret;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_event);
	rbd_assert(!rbd_dev->watch_request);
	rbd_assert(!rbd_dev->watch_handle);

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
				     &rbd_dev->watch_event);
	if (ret < 0)
		return ret;

	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
	if (IS_ERR(obj_request)) {
		ceph_osdc_cancel_event(rbd_dev->watch_event);
		rbd_dev->watch_event = NULL;
		return PTR_ERR(obj_request);
	}

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to it.
	 * We'll drop that reference after we've unregistered it in
	 * rbd_dev_header_unwatch_sync().
	 */
	rbd_dev->watch_request = obj_request;
	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
{
	struct rbd_obj_request *obj_request;

	rbd_assert(rbd_dev->watch_event);
	rbd_assert(rbd_dev->watch_request);
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_obj_request_end(rbd_dev->watch_request);
	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
	if (!rbd_dev->watch_handle)
		return;

	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
	if (!IS_ERR(obj_request))
		rbd_obj_request_put(obj_request);
	else
		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
			 PTR_ERR(obj_request));
	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	rbd_dev->watch_handle = NULL;
}

/*
@@ -4081,6 +3982,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
	init_rwsem(&rbd_dev->header_rwsem);

	ceph_oid_init(&rbd_dev->header_oid);
	ceph_oloc_init(&rbd_dev->header_oloc);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
@@ -5285,6 +5187,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
	if (rbd_dev->image_format == 1)
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
+3 −2
Original line number Diff line number Diff line
@@ -153,8 +153,9 @@ struct ceph_dir_layout {

/* watch-notify operations */
enum {
  WATCH_NOTIFY				= 1, /* notifying watcher */
  WATCH_NOTIFY_COMPLETE			= 2, /* notifier notified when done */
	CEPH_WATCH_EVENT_NOTIFY		  = 1, /* notifying watcher */
	CEPH_WATCH_EVENT_NOTIFY_COMPLETE  = 2, /* notifier notified when done */
	CEPH_WATCH_EVENT_DISCONNECT       = 3, /* we were disconnected */
};


+58 −39
Original line number Diff line number Diff line
@@ -34,7 +34,7 @@ struct ceph_osd {
	struct rb_node o_node;
	struct ceph_connection o_con;
	struct rb_root o_requests;
	struct list_head o_linger_requests;
	struct rb_root o_linger_requests;
	struct list_head o_osd_lru;
	struct ceph_auth_handshake o_auth;
	unsigned long lru_ttl;
@@ -108,11 +108,12 @@ struct ceph_osd_req_op {
		} cls;
		struct {
			u64 cookie;
			u64 ver;
			u32 prot_ver;
			u32 timeout;
			__u8 flag;
			__u8 op;           /* CEPH_OSD_WATCH_OP_ */
			u32 gen;
		} watch;
		struct {
			struct ceph_osd_data request_data;
		} notify_ack;
		struct {
			u64 expected_object_size;
			u64 expected_write_size;
@@ -145,8 +146,6 @@ struct ceph_osd_request_target {
struct ceph_osd_request {
	u64             r_tid;              /* unique for this client */
	struct rb_node  r_node;
	struct list_head r_linger_item;
	struct list_head r_linger_osd_item;
	struct ceph_osd *r_osd;

	struct ceph_osd_request_target r_t;
@@ -162,7 +161,6 @@ struct ceph_osd_request {

	int               r_result;
	bool              r_got_reply;
	int		  r_linger;

	struct ceph_osd_client *r_osdc;
	struct kref       r_kref;
@@ -181,6 +179,7 @@ struct ceph_osd_request {
	struct ceph_snap_context *r_snapc;    /* for writes */
	struct timespec r_mtime;              /* ditto */
	u64 r_data_offset;                    /* ditto */
	bool r_linger;                        /* don't resend on failure */

	/* internal */
	unsigned long r_stamp;                /* jiffies, send or check time */
@@ -195,23 +194,40 @@ struct ceph_request_redirect {
	struct ceph_object_locator oloc;
};

struct ceph_osd_event {
	u64 cookie;
	int one_shot;
typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
				 u64 notifier_id, void *data, size_t data_len);
typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);

struct ceph_osd_linger_request {
	struct ceph_osd_client *osdc;
	void (*cb)(u64, u64, u8, void *);
	void *data;
	struct rb_node node;
	struct list_head osd_node;
	u64 linger_id;
	bool committed;

	struct ceph_osd *osd;
	struct ceph_osd_request *reg_req;
	struct ceph_osd_request *ping_req;
	unsigned long ping_sent;

	struct ceph_osd_request_target t;
	u32 last_force_resend;

	struct timespec mtime;

	struct kref kref;
};
	struct mutex lock;
	struct rb_node node;            /* osd */
	struct rb_node osdc_node;       /* osdc */
	struct list_head scan_item;

	struct completion reg_commit_wait;
	int reg_commit_error;
	int last_error;

struct ceph_osd_event_work {
	struct work_struct work;
	struct ceph_osd_event *event;
        u64 ver;
        u64 notify_id;
        u8 opcode;
	u32 register_gen;

	rados_watchcb2_t wcb;
	rados_watcherrcb_t errcb;
	void *data;
};

struct ceph_osd_client {
@@ -223,9 +239,10 @@ struct ceph_osd_client {
	struct rb_root         osds;          /* osds */
	struct list_head       osd_lru;       /* idle osds */
	spinlock_t             osd_lru_lock;
	struct list_head       req_linger;    /* lingering requests */
	struct ceph_osd        homeless_osd;
	atomic64_t             last_tid;      /* tid of last request */
	u64                    last_linger_id;
	struct rb_root         linger_requests; /* lingering requests */
	atomic_t               num_requests;
	atomic_t               num_homeless;
	struct delayed_work    timeout_work;
@@ -239,10 +256,6 @@ struct ceph_osd_client {
	struct ceph_msgpool	msgpool_op;
	struct ceph_msgpool	msgpool_op_reply;

	spinlock_t		event_lock;
	struct rb_root		event_tree;
	u64			event_count;

	struct workqueue_struct	*notify_wq;
};

@@ -314,9 +327,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
				 u16 opcode, const char *name, const void *value,
				 size_t size, u8 cmp_op, u8 cmp_mode);
extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
					unsigned int which, u16 opcode,
					u64 cookie, u64 version, int flag);
extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
				       unsigned int which,
				       u64 expected_object_size,
@@ -339,9 +349,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
				      u32 truncate_seq, u64 truncate_size,
				      bool use_mempool);

extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
					 struct ceph_osd_request *req);

extern void ceph_osdc_get_request(struct ceph_osd_request *req);
extern void ceph_osdc_put_request(struct ceph_osd_request *req);

@@ -372,11 +379,23 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
				struct timespec *mtime,
				struct page **pages, int nr_pages);

/* watch/notify events */
extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
				  void (*event_cb)(u64, u64, u8, void *),
				  void *data, struct ceph_osd_event **pevent);
extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
extern void ceph_osdc_put_event(struct ceph_osd_event *event);
/* watch/notify */
/*
 * ceph_osdc_watch() - register a watch on the object @oid in @oloc.
 *
 * @wcb and @errcb are the callbacks stored for the watch; @data is the
 * opaque pointer handed back to them as their first argument (rbd
 * passes its rbd_device).  On success the linger request is returned
 * as a handle -- only to keep lifetime management simple -- and must
 * later be passed to ceph_osdc_unwatch().  On failure an ERR_PTR() is
 * returned (callers check with IS_ERR()).
 *
 * ceph_osdc_unwatch() - tear down a watch previously set up with
 * ceph_osdc_watch().  Returns non-zero on failure.
 * NOTE(review): whether @lreq's reference is consumed on failure is
 * not visible from the declarations -- confirm in the implementation.
 */
struct ceph_osd_linger_request *
ceph_osdc_watch(struct ceph_osd_client *osdc,
		struct ceph_object_id *oid,
		struct ceph_object_locator *oloc,
		rados_watchcb2_t wcb,
		rados_watcherrcb_t errcb,
		void *data);
int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
		      struct ceph_osd_linger_request *lreq);

/*
 * ceph_osdc_notify_ack() - acknowledge notify @notify_id received on
 * the watch identified by @cookie for object @oid in @oloc.
 *
 * @payload/@payload_len describe an optional data payload that is
 * relayed back to the notifier; pass NULL/0 for none (as rbd does).
 * Returns non-zero on failure.
 */
int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
			 struct ceph_object_id *oid,
			 struct ceph_object_locator *oloc,
			 u64 notify_id,
			 u64 cookie,
			 void *payload,
			 size_t payload_len);
#endif
+14 −3
Original line number Diff line number Diff line
@@ -427,7 +427,17 @@ enum {
	CEPH_OSD_CMPXATTR_MODE_U64    = 2
};

#define RADOS_NOTIFY_VER	1
/*
 * Sub-operation codes for a watch op (the CEPH_OSD_WATCH_OP_* values).
 *
 * Every code other than UNWATCH must be ODD: pre-giant code would
 * otherwise interpret the op as UNWATCH.  Keep that in mind when
 * adding new values.
 */
enum {
	CEPH_OSD_WATCH_OP_UNWATCH	= 0,
	CEPH_OSD_WATCH_OP_LEGACY_WATCH	= 1,
	CEPH_OSD_WATCH_OP_WATCH		= 3,
	CEPH_OSD_WATCH_OP_RECONNECT	= 5,
	CEPH_OSD_WATCH_OP_PING		= 7,
};

const char *ceph_osd_watch_op_name(int o);

/*
 * an individual object operation.  each may be accompanied by some data
@@ -462,8 +472,9 @@ struct ceph_osd_op {
	        } __attribute__ ((packed)) snap;
		struct {
			__le64 cookie;
			__le64 ver;
			__u8 flag;	/* 0 = unwatch, 1 = watch */
			__le64 ver;     /* no longer used */
			__u8 op;	/* CEPH_OSD_WATCH_OP_* */
			__le32 gen;     /* registration generation */
		} __attribute__ ((packed)) watch;
		struct {
			__le64 offset, length;
+16 −0
Original line number Diff line number Diff line
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
	}
}

/*
 * Map a CEPH_OSD_WATCH_OP_* code to a short human-readable name.
 *
 * Returns a pointer to a string literal; any code without an explicit
 * name (including CEPH_OSD_WATCH_OP_LEGACY_WATCH) yields "???".
 */
const char *ceph_osd_watch_op_name(int o)
{
	const char *name = "???";

	if (o == CEPH_OSD_WATCH_OP_UNWATCH)
		name = "unwatch";
	else if (o == CEPH_OSD_WATCH_OP_WATCH)
		name = "watch";
	else if (o == CEPH_OSD_WATCH_OP_RECONNECT)
		name = "reconnect";
	else if (o == CEPH_OSD_WATCH_OP_PING)
		name = "ping";

	return name;
}

const char *ceph_osd_state_name(int s)
{
	switch (s) {
Loading