Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f41def39 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The highlights are:

   - automatic recovery of a blacklisted filesystem session (Zheng Yan).
     This is disabled by default and can be enabled by mounting with the
     new "recover_session=clean" option.

   - serialize buffered reads and O_DIRECT writes (Jeff Layton). Care is
     taken to avoid serializing O_DIRECT reads and writes with each
     other, this is based on the exclusion scheme from NFS.

   - handle large osdmaps better in the face of fragmented memory
     (myself)

   - don't limit what security.* xattrs can be get or set (Jeff Layton).
     We were overly restrictive here, unnecessarily preventing things
     like file capability sets stored in security.capability from
     working.

   - allow copy_file_range() within the same inode and across different
     filesystems within the same cluster (Luis Henriques)"

* tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client: (41 commits)
  ceph: call ceph_mdsc_destroy from destroy_fs_client
  libceph: use ceph_kvmalloc() for osdmap arrays
  libceph: avoid a __vmalloc() deadlock in ceph_kvmalloc()
  ceph: allow object copies across different filesystems in the same cluster
  ceph: include ceph_debug.h in cache.c
  ceph: move static keyword to the front of declarations
  rbd: pull rbd_img_request_create() dout out into the callers
  ceph: reconnect connection if session hang in opening state
  libceph: drop unused con parameter of calc_target()
  ceph: use release_pages() directly
  rbd: fix response length parameter for encoded strings
  ceph: allow arbitrary security.* xattrs
  ceph: only set CEPH_I_SEC_INITED if we got a MAC label
  ceph: turn ceph_security_invalidate_secctx into static inline
  ceph: add buffered/direct exclusionary locking for reads and writes
  libceph: handle OSD op ceph_pagelist_append() errors
  ceph: don't return a value from void function
  ceph: don't freeze during write page faults
  ceph: update the mtime when truncating up
  ceph: fix indentation in __get_snap_name()
  ...
parents 7b1373dd 3ee5a701
Loading
Loading
Loading
Loading
+14 −0
Original line number Diff line number Diff line
@@ -158,6 +158,20 @@ Mount Options
        copies.  Currently, it's only used in copy_file_range, which will revert
        to the default VFS implementation if this option is used.

  recover_session=<no|clean>
	Set auto reconnect mode in the case where the client is blacklisted. The
	available modes are "no" and "clean". The default is "no".

	* no: never attempt to reconnect when client detects that it has been
	blacklisted. Operations will generally fail after being blacklisted.

	* clean: client reconnects to the ceph cluster automatically when it
	detects that it has been blacklisted. During reconnect, client drops
	dirty data/metadata, invalidates page caches and writable file handles.
	After reconnect, file locks become stale because the MDS loses track
	of them. If an inode contains any stale file locks, read/write on the
	inode is not allowed until applications release all stale file locks.

More Information
================

+12 −6
Original line number Diff line number Diff line
@@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create(
	mutex_init(&img_request->state_mutex);
	kref_init(&img_request->kref);

	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
	     obj_op_name(op_type), img_request);
	return img_request;
}

@@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
	child_img_req->obj_request = obj_req;

	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
	     obj_req);

	if (!rbd_img_is_write(img_req)) {
		switch (img_req->data_type) {
		case OBJ_REQUEST_BIO:
@@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work)
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
	     img_request, obj_op_name(op_type), offset, length);

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
@@ -5669,17 +5673,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	size_t size;
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	/* Response will be an encoded string, which includes a length */
	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
@@ -6696,7 +6703,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
	dout("rbd id object name is %s\n", oid.name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
@@ -6708,7 +6714,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)

	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "get_id", NULL, 0,
				  response, RBD_IMAGE_ID_LEN_MAX);
				  response, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
+1 −1
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@
obj-$(CONFIG_CEPH_FS) += ceph.o

ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
	export.o caps.o snap.o xattr.o quota.o \
	export.o caps.o snap.o xattr.o quota.o io.o \
	mds_client.o mdsmap.o strings.o ceph_frag.o \
	debugfs.o

+30 −31
Original line number Diff line number Diff line
@@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
{
	struct inode *inode = file_inode(filp);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_client *osdc =
		&ceph_inode_to_client(inode)->client->osdc;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	int err = 0;
	u64 off = page_offset(page);
	u64 len = PAGE_SIZE;
@@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)

	dout("readpage inode %p file %p page %p index %lu\n",
	     inode, filp, page, page->index);
	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
				  off, &len,
	err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
				  &ci->i_layout, off, &len,
				  ci->i_truncate_seq, ci->i_truncate_size,
				  &page, 1, 0);
	if (err == -ENOENT)
@@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
	if (err < 0) {
		SetPageError(page);
		ceph_fscache_readpage_cancel(inode, page);
		if (err == -EBLACKLISTED)
			fsc->blacklisted = true;
		goto out;
	}
	if (err < PAGE_SIZE)
@@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req)
	int i;

	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
	if (rc == -EBLACKLISTED)
		ceph_inode_to_client(inode)->blacklisted = true;

	/* unlock all pages, zeroing any data we didn't read */
	osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
		/* caller of readpages does not hold buffer and read caps
		 * (fadvise, madvise and readahead cases) */
		int want = CEPH_CAP_FILE_CACHE;
		ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got);
		ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
					true, &got);
		if (ret < 0) {
			dout("start_read %p, error getting cap\n", inode);
		} else if (!(got & want)) {
@@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode,
/*
 * Write a single page, but leave the page locked.
 *
 * If we get a write error, set the page error bit, but still adjust the
 * If we get a write error, mark the mapping for error, but still adjust the
 * dirty page accounting (i.e., page is no longer dirty).
 */
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
@@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
			end_page_writeback(page);
			return err;
		}
		if (err == -EBLACKLISTED)
			fsc->blacklisted = true;
		dout("writepage setting page/mapping error %d %p\n",
		     err, page);
		SetPageError(page);
		mapping_set_error(&inode->i_data, err);
		wbc->pages_skipped++;
	} else {
@@ -679,23 +684,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
	return err;
}

/*
 * lame release_pages helper.  release_pages() isn't exported to
 * modules.
 */
static void ceph_release_pages(struct page **pages, int num)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec);
	for (i = 0; i < num; i++) {
		if (pagevec_add(&pvec, pages[i]) == 0)
			pagevec_release(&pvec);
	}
	pagevec_release(&pvec);
}

/*
 * async writeback completion handler.
 *
@@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req)
	if (rc < 0) {
		mapping_set_error(mapping, rc);
		ceph_set_error_write(ci);
		if (rc == -EBLACKLISTED)
			fsc->blacklisted = true;
	} else {
		ceph_clear_error_write(ci);
	}
@@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req)
		dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
		     inode, osd_data->length, rc >= 0 ? num_pages : 0);

		ceph_release_pages(osd_data->pages, num_pages);
		release_pages(osd_data->pages, num_pages);
	}

	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
@@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
		want = CEPH_CAP_FILE_CACHE;

	got = 0;
	err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
			    &got, &pinned_page);
	if (err < 0)
		goto out_restore;

@@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
	if (!prealloc_cf)
		return VM_FAULT_OOM;

	sb_start_pagefault(inode->i_sb);
	ceph_block_sigs(&oldset);

	if (ci->i_inline_version != CEPH_INLINE_NONE) {
@@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
		want = CEPH_CAP_FILE_BUFFER;

	got = 0;
	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
			    &got, NULL);
	if (err < 0)
		goto out_free;
@@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
	ceph_put_cap_refs(ci, got);
out_free:
	ceph_restore_sigs(&oldset);
	sb_end_pagefault(inode->i_sb);
	ceph_free_cap_flush(prealloc_cf);
	if (err < 0)
		ret = vmf_error(err);
@@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,

	if (err >= 0 || err == -ENOENT)
		have |= POOL_READ;
	else if (err != -EPERM)
	else if (err != -EPERM) {
		if (err == -EBLACKLISTED)
			fsc->blacklisted = true;
		goto out_unlock;
	}

	if (err2 == 0 || err2 == -EEXIST)
		have |= POOL_WRITE;
	else if (err2 != -EPERM) {
		if (err2 == -EBLACKLISTED)
			fsc->blacklisted = true;
		err = err2;
		goto out_unlock;
	}
@@ -1989,10 +1987,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
	return err;
}

int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
int ceph_pool_perm_check(struct inode *inode, int need)
{
	s64 pool;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_string *pool_ns;
	s64 pool;
	int ret, flags;

	if (ci->i_vino.snap != CEPH_NOSNAP) {
@@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
		return 0;
	}

	if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
	if (ceph_test_mount_opt(ceph_inode_to_client(inode),
				NOPOOLPERM))
		return 0;

+2 −0
Original line number Diff line number Diff line
@@ -6,6 +6,8 @@
 *  Written by Milosz Tanski (milosz@adfin.com)
 */

#include <linux/ceph/ceph_debug.h>

#include "super.h"
#include "cache.h"

Loading