Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1204c464 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull Ceph updates from Sage Weil:
 "This time around we have a collection of CephFS fixes from Zheng
  around MDS failure handling and snapshots, support for a new CRUSH
  straw2 algorithm (to sync up with userspace) and several RBD cleanups
  and fixes from Ilya, an error path leak fix from Taesoo, and then an
  assorted collection of cleanups from others"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (28 commits)
  rbd: rbd_wq comment is obsolete
  libceph: announce support for straw2 buckets
  crush: straw2 bucket type with an efficient 64-bit crush_ln()
  crush: ensuring at most num-rep osds are selected
  crush: drop unnecessary include from mapper.c
  ceph: fix uninline data function
  ceph: rename snapshot support
  ceph: fix null pointer dereference in send_mds_reconnect()
  ceph: hold on to exclusive caps on complete directories
  libceph: simplify our debugfs attr macro
  ceph: show non-default options only
  libceph: expose client options through debugfs
  libceph, ceph: split ceph_show_options()
  rbd: mark block queue as non-rotational
  libceph: don't overwrite specific con error msgs
  ceph: cleanup unsafe requests when reconnecting is denied
  ceph: don't zero i_wrbuffer_ref when reconnecting is denied
  ceph: don't mark dirty caps when there is no auth cap
  ceph: keep i_snap_realm while there are writers
  libceph: osdmap.h: Add missing format newlines
  ...
parents 4f211235 f77303bd
Loading
Loading
Loading
Loading
+20 −6
Original line number Diff line number Diff line
@@ -3762,8 +3762,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
		goto out_tag_set;
	}

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
@@ -5301,9 +5301,14 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)

	if (mapping) {
		ret = rbd_dev_header_watch_sync(rbd_dev);
		if (ret)
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->image_name);
			goto out_header_name;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
@@ -5319,8 +5324,14 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret)
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
@@ -5390,8 +5401,11 @@ static ssize_t do_rbd_add(struct bus_type *bus,

	/* pick the pool */
	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
	if (rc < 0)
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	/* The ceph file layout needs to fit pool id in 32 bits */
@@ -5673,7 +5687,7 @@ static int __init rbd_init(void)

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices, so leave @max_active at default.
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
+25 −13
Original line number Diff line number Diff line
@@ -1146,6 +1146,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
		     inode, page, (int)pos, (int)len);

		r = ceph_update_writeable_page(file, pos, len, page);
		if (r < 0)
			page_cache_release(page);
		else
			*pagep = page;
	} while (r == -EAGAIN);

	return r;
@@ -1534,19 +1538,27 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)

	osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);

	{
		__le64 xattr_buf = cpu_to_le64(inline_version);
		err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
				    "inline_version", &inline_version,
				    sizeof(inline_version),
					    "inline_version", &xattr_buf,
					    sizeof(xattr_buf),
					    CEPH_OSD_CMPXATTR_OP_GT,
					    CEPH_OSD_CMPXATTR_MODE_U64);
		if (err)
			goto out_put;
	}

	{
		char xattr_buf[32];
		int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
					 "%llu", inline_version);
		err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
				    "inline_version", &inline_version,
				    sizeof(inline_version), 0, 0);
					    "inline_version",
					    xattr_buf, xattr_len, 0, 0);
		if (err)
			goto out_put;
	}

	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+41 −10
Original line number Diff line number Diff line
@@ -896,6 +896,18 @@ int ceph_is_any_caps(struct inode *inode)
	return ret;
}

/*
 * Detach @ci from the snap realm it currently points at and drop the
 * reference on that realm.
 *
 * ci->i_snap_realm is dereferenced without a NULL check, so callers
 * must only call this while a realm is still attached.
 */
static void drop_inode_snap_realm(struct ceph_inode_info *ci)
{
	struct ceph_snap_realm *old_realm = ci->i_snap_realm;
	struct ceph_mds_client *mdsc;

	/* unlink from the realm's inode list under the realm's own lock */
	spin_lock(&old_realm->inodes_with_caps_lock);
	list_del_init(&ci->i_snap_realm_item);
	ci->i_snap_realm_counter++;
	ci->i_snap_realm = NULL;
	spin_unlock(&old_realm->inodes_with_caps_lock);

	/* the realm reference is put only after the lock is released */
	mdsc = ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
	ceph_put_snap_realm(mdsc, old_realm);
}

/*
 * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
 *
@@ -946,15 +958,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
	if (removed)
		ceph_put_cap(mdsc, cap);

	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
		struct ceph_snap_realm *realm = ci->i_snap_realm;
		spin_lock(&realm->inodes_with_caps_lock);
		list_del_init(&ci->i_snap_realm_item);
		ci->i_snap_realm_counter++;
		ci->i_snap_realm = NULL;
		spin_unlock(&realm->inodes_with_caps_lock);
		ceph_put_snap_realm(mdsc, realm);
	}
	/* when reconnect denied, we remove session caps forcibly,
	 * i_wr_ref can be non-zero. If there are ongoing write,
	 * keep i_snap_realm.
	 */
	if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
		drop_inode_snap_realm(ci);

	if (!__ceph_is_any_real_caps(ci))
		__cap_delay_cancel(mdsc, ci);
}
@@ -1394,6 +1404,13 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
	int was = ci->i_dirty_caps;
	int dirty = 0;

	if (!ci->i_auth_cap) {
		pr_warn("__mark_dirty_caps %p %llx mask %s, "
			"but no auth cap (session was closed?)\n",
			inode, ceph_ino(inode), ceph_cap_string(mask));
		return 0;
	}

	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
	     ceph_cap_string(mask), ceph_cap_string(was),
	     ceph_cap_string(was | mask));
@@ -1404,7 +1421,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
				ci->i_snap_realm->cached_context);
		dout(" inode %p now dirty snapc %p auth cap %p\n",
		     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
		WARN_ON(!ci->i_auth_cap);
		BUG_ON(!list_empty(&ci->i_dirty_item));
		spin_lock(&mdsc->cap_dirty_lock);
		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1545,7 +1561,19 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
	if (!mdsc->stopping && inode->i_nlink > 0) {
		if (want) {
			retain |= CEPH_CAP_ANY;       /* be greedy */
		} else if (S_ISDIR(inode->i_mode) &&
			   (issued & CEPH_CAP_FILE_SHARED) &&
			    __ceph_dir_is_complete(ci)) {
			/*
			 * If a directory is complete, we want to keep
			 * the exclusive cap. So that MDS does not end up
			 * revoking the shared cap on every create/unlink
			 * operation.
			 */
			want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
			retain |= want;
		} else {

			retain |= CEPH_CAP_ANY_SHARED;
			/*
			 * keep RD only if we didn't have the file open RW,
@@ -2309,6 +2337,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
					wake = 1;
				}
			}
			/* see comment in __ceph_remove_cap() */
			if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
				drop_inode_snap_realm(ci);
		}
	spin_unlock(&ci->i_ceph_lock);

+34 −14
Original line number Diff line number Diff line
@@ -281,6 +281,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
	/* can we use the dcache? */
	spin_lock(&ci->i_ceph_lock);
	if ((ctx->pos == 2 || fi->dentry) &&
	    ceph_test_mount_opt(fsc, DCACHE) &&
	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
	    ceph_snap(inode) != CEPH_SNAPDIR &&
	    __ceph_dir_is_complete_ordered(ci) &&
@@ -336,16 +337,23 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
			ceph_mdsc_put_request(req);
			return err;
		}
		req->r_inode = inode;
		ihold(inode);
		req->r_dentry = dget(file->f_path.dentry);
		/* hints to request -> mds selection code */
		req->r_direct_mode = USE_AUTH_MDS;
		req->r_direct_hash = ceph_frag_value(frag);
		req->r_direct_is_hash = true;
		if (fi->last_name) {
			req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
			if (!req->r_path2) {
				ceph_mdsc_put_request(req);
				return -ENOMEM;
			}
		}
		req->r_readdir_offset = fi->next_offset;
		req->r_args.readdir.frag = cpu_to_le32(frag);

		req->r_inode = inode;
		ihold(inode);
		req->r_dentry = dget(file->f_path.dentry);
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err < 0) {
			ceph_mdsc_put_request(req);
@@ -629,6 +637,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
			    fsc->mount_options->snapdir_name,
			    dentry->d_name.len) &&
		    !is_root_ceph_dentry(dir, dentry) &&
		    ceph_test_mount_opt(fsc, DCACHE) &&
		    __ceph_dir_is_complete(ci) &&
		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
			spin_unlock(&ci->i_ceph_lock);
@@ -755,10 +764,15 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
		err = PTR_ERR(req);
		goto out;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_path2 = kstrdup(dest, GFP_NOFS);
	if (!req->r_path2) {
		err = -ENOMEM;
		ceph_mdsc_put_request(req);
		goto out;
	}
	req->r_locked_dir = dir;
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
@@ -933,16 +947,20 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int op = CEPH_MDS_OP_RENAME;
	int err;

	if (ceph_snap(old_dir) != ceph_snap(new_dir))
		return -EXDEV;
	if (ceph_snap(old_dir) != CEPH_NOSNAP ||
	    ceph_snap(new_dir) != CEPH_NOSNAP)
	if (ceph_snap(old_dir) != CEPH_NOSNAP) {
		if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
			op = CEPH_MDS_OP_RENAMESNAP;
		else
			return -EROFS;
	}
	dout("rename dir %p dentry %p to dir %p dentry %p\n",
	     old_dir, old_dentry, new_dir, new_dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	ihold(old_dir);
@@ -1240,11 +1258,12 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
		     inode, req->r_tid, last_tid);
		if (req->r_timeout) {
			ret = wait_for_completion_timeout(
				&req->r_safe_completion, req->r_timeout);
			if (ret > 0)
			unsigned long time_left = wait_for_completion_timeout(
							&req->r_safe_completion,
							req->r_timeout);
			if (time_left > 0)
				ret = 0;
			else if (ret == 0)
			else
				ret = -EIO;  /* timed out */
		} else {
			wait_for_completion(&req->r_safe_completion);
@@ -1372,6 +1391,7 @@ const struct inode_operations ceph_snapdir_iops = {
	.getattr = ceph_getattr,
	.mkdir = ceph_mkdir,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
};

const struct dentry_operations ceph_dentry_ops = {
+45 −16
Original line number Diff line number Diff line
@@ -1021,6 +1021,33 @@ static void cleanup_cap_releases(struct ceph_mds_session *session)
	spin_unlock(&session->s_cap_lock);
}

/*
 * Clean up the requests tied to @session.
 *
 * With mdsc->mutex held:
 *  - every request on session->s_unsafe is taken off that list, logged
 *    and unregistered;
 *  - every request in mdsc->request_tree whose r_session refers to the
 *    same mds as @session gets r_attempts reset to zero.
 */
static void cleanup_session_requests(struct ceph_mds_client *mdsc,
				     struct ceph_mds_session *session)
{
	struct ceph_mds_request *req;
	struct rb_node *node;

	dout("cleanup_session_requests mds%d\n", session->s_mds);

	mutex_lock(&mdsc->mutex);

	/* drain the unsafe list from the head until nothing is left */
	while (!list_empty(&session->s_unsafe)) {
		req = list_first_entry(&session->s_unsafe,
				       struct ceph_mds_request, r_unsafe_item);
		list_del_init(&req->r_unsafe_item);
		pr_info(" dropping unsafe request %llu\n", req->r_tid);
		__unregister_request(mdsc, req);
	}

	/* zero r_attempts, so kick_requests() will re-send requests */
	for (node = rb_first(&mdsc->request_tree); node;
	     node = rb_next(node)) {
		req = rb_entry(node, struct ceph_mds_request, r_node);

		/* skip requests with no session or bound to another mds */
		if (!req->r_session ||
		    req->r_session->s_mds != session->s_mds)
			continue;

		req->r_attempts = 0;
	}

	mutex_unlock(&mdsc->mutex);
}

/*
 * Helper to safely iterate over all caps associated with a session, with
 * special care taken to handle a racing __ceph_remove_cap().
@@ -1098,7 +1125,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
	     cap, ci, &ci->vfs_inode);
	spin_lock(&ci->i_ceph_lock);
	__ceph_remove_cap(cap, false);
	if (!__ceph_is_any_real_caps(ci)) {
	if (!ci->i_auth_cap) {
		struct ceph_mds_client *mdsc =
			ceph_sb_to_client(inode->i_sb)->mdsc;

@@ -1120,13 +1147,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
			mdsc->num_cap_flushing--;
			drop = 1;
		}
		if (drop && ci->i_wrbuffer_ref) {
			pr_info(" dropping dirty data for %p %lld\n",
				inode, ceph_ino(inode));
			ci->i_wrbuffer_ref = 0;
			ci->i_wrbuffer_ref_head = 0;
			drop++;
		}
		spin_unlock(&mdsc->cap_dirty_lock);
	}
	spin_unlock(&ci->i_ceph_lock);
@@ -1853,7 +1873,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 */
static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
					       struct ceph_mds_request *req,
					       int mds)
					       int mds, bool drop_cap_releases)
{
	struct ceph_msg *msg;
	struct ceph_mds_request_head *head;
@@ -1937,6 +1957,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
		releases += ceph_encode_inode_release(&p,
		      req->r_old_dentry->d_inode,
		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);

	if (drop_cap_releases) {
		releases = 0;
		p = msg->front.iov_base + req->r_request_release_offset;
	}

	head->num_releases = cpu_to_le16(releases);

	/* time stamp */
@@ -1989,7 +2015,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
 */
static int __prepare_send_request(struct ceph_mds_client *mdsc,
				  struct ceph_mds_request *req,
				  int mds)
				  int mds, bool drop_cap_releases)
{
	struct ceph_mds_request_head *rhead;
	struct ceph_msg *msg;
@@ -2048,7 +2074,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
		ceph_msg_put(req->r_request);
		req->r_request = NULL;
	}
	msg = create_request_message(mdsc, req, mds);
	msg = create_request_message(mdsc, req, mds, drop_cap_releases);
	if (IS_ERR(msg)) {
		req->r_err = PTR_ERR(msg);
		complete_request(mdsc, req);
@@ -2132,7 +2158,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
	if (req->r_request_started == 0)   /* note request start time */
		req->r_request_started = jiffies;

	err = __prepare_send_request(mdsc, req, mds);
	err = __prepare_send_request(mdsc, req, mds, false);
	if (!err) {
		ceph_msg_get(req->r_request);
		ceph_con_send(&session->s_con, req->r_request);
@@ -2590,6 +2616,7 @@ static void handle_session(struct ceph_mds_session *session,
	case CEPH_SESSION_CLOSE:
		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
			pr_info("mds%d reconnect denied\n", session->s_mds);
		cleanup_session_requests(mdsc, session);
		remove_session_caps(session);
		wake = 2; /* for good measure */
		wake_up_all(&mdsc->session_close_wq);
@@ -2658,7 +2685,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,

	mutex_lock(&mdsc->mutex);
	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
		err = __prepare_send_request(mdsc, req, session->s_mds);
		err = __prepare_send_request(mdsc, req, session->s_mds, true);
		if (!err) {
			ceph_msg_get(req->r_request);
			ceph_con_send(&session->s_con, req->r_request);
@@ -2679,7 +2706,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
			continue; /* only old requests */
		if (req->r_session &&
		    req->r_session->s_mds == session->s_mds) {
			err = __prepare_send_request(mdsc, req, session->s_mds);
			err = __prepare_send_request(mdsc, req,
						     session->s_mds, true);
			if (!err) {
				ceph_msg_get(req->r_request);
				ceph_con_send(&session->s_con, req->r_request);
@@ -2864,6 +2892,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
	spin_unlock(&session->s_cap_lock);

	/* trim unused caps to reduce MDS's cache rejoin time */
	if (mdsc->fsc->sb->s_root)
		shrink_dcache_parent(mdsc->fsc->sb->s_root);

	ceph_con_close(&session->s_con);
@@ -3133,7 +3162,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
		    di->lease_renew_from &&
		    di->lease_renew_after == 0) {
			unsigned long duration =
				le32_to_cpu(h->duration_ms) * HZ / 1000;
				msecs_to_jiffies(le32_to_cpu(h->duration_ms));

			di->lease_seq = seq;
			dentry->d_time = di->lease_renew_from + duration;
Loading