Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d5a38f6e authored by Linus Torvalds
Browse files
Pull Ceph updates from Sage Weil:
 "There is quite a bit here, including some overdue refactoring and
  cleanup on the mon_client and osd_client code from Ilya, scattered
  writeback support for CephFS and a pile of bug fixes from Zheng, and a
  few random cleanups and fixes from others"

[ I already decided not to pull this because of it having been rebased
  recently, but ended up changing my mind after all.  Next time I'll
  really hold people to it.  Oh well.   - Linus ]

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (34 commits)
  libceph: use KMEM_CACHE macro
  ceph: use kmem_cache_zalloc
  rbd: use KMEM_CACHE macro
  ceph: use lookup request to revalidate dentry
  ceph: kill ceph_get_dentry_parent_inode()
  ceph: fix security xattr deadlock
  ceph: don't request vxattrs from MDS
  ceph: fix mounting same fs multiple times
  ceph: remove unnecessary NULL check
  ceph: avoid updating directory inode's i_size accidentally
  ceph: fix race during filling readdir cache
  libceph: use sizeof_footer() more
  ceph: kill ceph_empty_snapc
  ceph: fix a wrong comparison
  ceph: replace CURRENT_TIME by current_fs_time()
  ceph: scattered page writeback
  libceph: add helper that duplicates last extent operation
  libceph: enable large, variable-sized OSD requests
  libceph: osdc->req_mempool should be backed by a slab pool
  libceph: make r_request msg_size calculation clearer
  ...
parents 698f415c 5ee61e95
Loading
Loading
Loading
Loading
+3 −11
Original line number Original line Diff line number Diff line
@@ -1847,14 +1847,12 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
	if (osd_req->r_result < 0)
	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;
		obj_request->result = osd_req->r_result;


	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);

	/*
	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to the block layer, which just supports a 32-bit
	 * passed to the block layer, which just supports a 32-bit
	 * length field.
	 * length field.
	 */
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	obj_request->xferred = osd_req->r_ops[0].outdata_len;
	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
	rbd_assert(obj_request->xferred < (u64)UINT_MAX);


	opcode = osd_req->r_ops[0].op;
	opcode = osd_req->r_ops[0].op;
@@ -5643,18 +5641,12 @@ static void rbd_sysfs_cleanup(void)
static int rbd_slab_init(void)
static int rbd_slab_init(void)
{
{
	rbd_assert(!rbd_img_request_cache);
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
					sizeof (struct rbd_img_request),
					__alignof__(struct rbd_img_request),
					0, NULL);
	if (!rbd_img_request_cache)
	if (!rbd_img_request_cache)
		return -ENOMEM;
		return -ENOMEM;


	rbd_assert(!rbd_obj_request_cache);
	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
					sizeof (struct rbd_obj_request),
					__alignof__(struct rbd_obj_request),
					0, NULL);
	if (!rbd_obj_request_cache)
	if (!rbd_obj_request_cache)
		goto out_err;
		goto out_err;


+204 −120
Original line number Original line Diff line number Diff line
@@ -175,8 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,


static int ceph_releasepage(struct page *page, gfp_t g)
static int ceph_releasepage(struct page *page, gfp_t g)
{
{
	struct inode *inode = page->mapping ? page->mapping->host : NULL;
	dout("%p releasepage %p idx %lu\n", page->mapping->host,
	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
	     page, page->index);
	WARN_ON(PageDirty(page));
	WARN_ON(PageDirty(page));


	/* Can we release the page from the cache? */
	/* Can we release the page from the cache? */
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
	for (i = 0; i < num_pages; i++) {
	for (i = 0; i < num_pages; i++) {
		struct page *page = osd_data->pages[i];
		struct page *page = osd_data->pages[i];


		if (rc < 0 && rc != ENOENT)
		if (rc < 0 && rc != -ENOENT)
			goto unlock;
			goto unlock;
		if (bytes < (int)PAGE_CACHE_SIZE) {
		if (bytes < (int)PAGE_CACHE_SIZE) {
			/* zero (remainder of) page */
			/* zero (remainder of) page */
@@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
	struct inode *inode = req->r_inode;
	struct inode *inode = req->r_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_data *osd_data;
	struct ceph_osd_data *osd_data;
	unsigned wrote;
	struct page *page;
	struct page *page;
	int num_pages;
	int num_pages, total_pages = 0;
	int i;
	int i, j;
	int rc = req->r_result;
	struct ceph_snap_context *snapc = req->r_snapc;
	struct ceph_snap_context *snapc = req->r_snapc;
	struct address_space *mapping = inode->i_mapping;
	struct address_space *mapping = inode->i_mapping;
	int rc = req->r_result;
	u64 bytes = req->r_ops[0].extent.length;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	long writeback_stat;
	bool remove_page;
	unsigned issued = ceph_caps_issued(ci);


	dout("writepages_finish %p rc %d\n", inode, rc);
	if (rc < 0)
		mapping_set_error(mapping, rc);


	osd_data = osd_req_op_extent_osd_data(req, 0);
	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
	num_pages = calc_pages_for((u64)osd_data->alignment,
					(u64)osd_data->length);
	if (rc >= 0) {
	/*
	/*
		 * Assume we wrote the pages we originally sent.  The
	 * We lost the cache cap, need to truncate the page before
		 * osd might reply with fewer pages if our writeback
	 * it is unlocked, otherwise we'd truncate it later in the
		 * raced with a truncation and was adjusted at the osd,
	 * page truncation thread, possibly losing some data that
		 * so don't believe the reply.
	 * raced its way in
	 */
	 */
		wrote = num_pages;
	remove_page = !(ceph_caps_issued(ci) &
	} else {
			(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
		wrote = 0;
		mapping_set_error(mapping, rc);
	}
	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
	     inode, rc, bytes, wrote);


	/* clean all pages */
	/* clean all pages */
	for (i = 0; i < num_pages; i++) {
	for (i = 0; i < req->r_num_ops; i++) {
		page = osd_data->pages[i];
		if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
			break;

		osd_data = osd_req_op_extent_osd_data(req, i);
		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
		num_pages = calc_pages_for((u64)osd_data->alignment,
					   (u64)osd_data->length);
		total_pages += num_pages;
		for (j = 0; j < num_pages; j++) {
			page = osd_data->pages[j];
			BUG_ON(!page);
			BUG_ON(!page);
			WARN_ON(!PageUptodate(page));
			WARN_ON(!PageUptodate(page));


		writeback_stat =
			if (atomic_long_dec_return(&fsc->writeback_count) <
			atomic_long_dec_return(&fsc->writeback_count);
			     CONGESTION_OFF_THRESH(
		if (writeback_stat <
					fsc->mount_options->congestion_kb))
		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
				clear_bdi_congested(&fsc->backing_dev_info,
				clear_bdi_congested(&fsc->backing_dev_info,
						    BLK_RW_ASYNC);
						    BLK_RW_ASYNC);


			ceph_put_snap_context(page_snap_context(page));
			ceph_put_snap_context(page_snap_context(page));
			page->private = 0;
			page->private = 0;
			ClearPagePrivate(page);
			ClearPagePrivate(page);
		dout("unlocking %d %p\n", i, page);
			dout("unlocking %p\n", page);
			end_page_writeback(page);
			end_page_writeback(page);


		/*
			if (remove_page)
		 * We lost the cache cap, need to truncate the page before
				generic_error_remove_page(inode->i_mapping,
		 * it is unlocked, otherwise we'd truncate it later in the
							  page);
		 * page truncation thread, possibly losing some data that
		 * raced its way in
		 */
		if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
			generic_error_remove_page(inode->i_mapping, page);


			unlock_page(page);
			unlock_page(page);
		}
		}
	dout("%p wrote+cleaned %d pages\n", inode, wrote);
		dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
	ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
		     inode, osd_data->length, rc >= 0 ? num_pages : 0);


		ceph_release_pages(osd_data->pages, num_pages);
		ceph_release_pages(osd_data->pages, num_pages);
	}

	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);

	osd_data = osd_req_op_extent_osd_data(req, 0);
	if (osd_data->pages_from_pool)
	if (osd_data->pages_from_pool)
		mempool_free(osd_data->pages,
		mempool_free(osd_data->pages,
			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
@@ -778,17 +778,15 @@ static int ceph_writepages_start(struct address_space *mapping,
	while (!done && index <= end) {
	while (!done && index <= end) {
		unsigned i;
		unsigned i;
		int first;
		int first;
		pgoff_t next;
		pgoff_t strip_unit_end = 0;
		int pvec_pages, locked_pages;
		int num_ops = 0, op_idx;
		struct page **pages = NULL;
		int pvec_pages, locked_pages = 0;
		struct page **pages = NULL, **data_pages;
		mempool_t *pool = NULL;	/* Becomes non-null if mempool used */
		mempool_t *pool = NULL;	/* Becomes non-null if mempool used */
		struct page *page;
		struct page *page;
		int want;
		int want;
		u64 offset, len;
		u64 offset = 0, len = 0;
		long writeback_stat;


		next = 0;
		locked_pages = 0;
		max_pages = max_pages_ever;
		max_pages = max_pages_ever;


get_more_pages:
get_more_pages:
@@ -824,8 +822,8 @@ static int ceph_writepages_start(struct address_space *mapping,
				unlock_page(page);
				unlock_page(page);
				break;
				break;
			}
			}
			if (next && (page->index != next)) {
			if (strip_unit_end && (page->index > strip_unit_end)) {
				dout("not consecutive %p\n", page);
				dout("end of strip unit %p\n", page);
				unlock_page(page);
				unlock_page(page);
				break;
				break;
			}
			}
@@ -867,36 +865,31 @@ static int ceph_writepages_start(struct address_space *mapping,
			/*
			/*
			 * We have something to write.  If this is
			 * We have something to write.  If this is
			 * the first locked page this time through,
			 * the first locked page this time through,
			 * allocate an osd request and a page array
			 * calculate max possible write size and
			 * that it will use.
			 * allocate a page array
			 */
			 */
			if (locked_pages == 0) {
			if (locked_pages == 0) {
				BUG_ON(pages);
				u64 objnum;
				u64 objoff;

				/* prepare async write request */
				/* prepare async write request */
				offset = (u64)page_offset(page);
				offset = (u64)page_offset(page);
				len = wsize;
				len = wsize;
				req = ceph_osdc_new_request(&fsc->client->osdc,

							&ci->i_layout, vino,
				rc = ceph_calc_file_object_mapping(&ci->i_layout,
							offset, &len, 0,
								offset, len,
							do_sync ? 2 : 1,
								&objnum, &objoff,
							CEPH_OSD_OP_WRITE,
								&len);
							CEPH_OSD_FLAG_WRITE |
				if (rc < 0) {
							CEPH_OSD_FLAG_ONDISK,
							snapc, truncate_seq,
							truncate_size, true);
				if (IS_ERR(req)) {
					rc = PTR_ERR(req);
					unlock_page(page);
					unlock_page(page);
					break;
					break;
				}
				}


				if (do_sync)
				num_ops = 1 + do_sync;
					osd_req_op_init(req, 1,
				strip_unit_end = page->index +
							CEPH_OSD_OP_STARTSYNC, 0);
					((len - 1) >> PAGE_CACHE_SHIFT);

				req->r_callback = writepages_finish;
				req->r_inode = inode;


				BUG_ON(pages);
				max_pages = calc_pages_for(0, (u64)len);
				max_pages = calc_pages_for(0, (u64)len);
				pages = kmalloc(max_pages * sizeof (*pages),
				pages = kmalloc(max_pages * sizeof (*pages),
						GFP_NOFS);
						GFP_NOFS);
@@ -905,6 +898,20 @@ static int ceph_writepages_start(struct address_space *mapping,
					pages = mempool_alloc(pool, GFP_NOFS);
					pages = mempool_alloc(pool, GFP_NOFS);
					BUG_ON(!pages);
					BUG_ON(!pages);
				}
				}

				len = 0;
			} else if (page->index !=
				   (offset + len) >> PAGE_CACHE_SHIFT) {
				if (num_ops >= (pool ?  CEPH_OSD_SLAB_OPS :
							CEPH_OSD_MAX_OPS)) {
					redirty_page_for_writepage(wbc, page);
					unlock_page(page);
					break;
				}

				num_ops++;
				offset = (u64)page_offset(page);
				len = 0;
			}
			}


			/* note position of first page in pvec */
			/* note position of first page in pvec */
@@ -913,18 +920,16 @@ static int ceph_writepages_start(struct address_space *mapping,
			dout("%p will write page %p idx %lu\n",
			dout("%p will write page %p idx %lu\n",
			     inode, page, page->index);
			     inode, page, page->index);


			writeback_stat =
			if (atomic_long_inc_return(&fsc->writeback_count) >
			       atomic_long_inc_return(&fsc->writeback_count);
			    CONGESTION_ON_THRESH(
			if (writeback_stat > CONGESTION_ON_THRESH(
				    fsc->mount_options->congestion_kb)) {
				    fsc->mount_options->congestion_kb)) {
				set_bdi_congested(&fsc->backing_dev_info,
				set_bdi_congested(&fsc->backing_dev_info,
						  BLK_RW_ASYNC);
						  BLK_RW_ASYNC);
			}
			}


			set_page_writeback(page);
			pages[locked_pages] = page;
			pages[locked_pages] = page;
			locked_pages++;
			locked_pages++;
			next = page->index + 1;
			len += PAGE_CACHE_SIZE;
		}
		}


		/* did we get anything? */
		/* did we get anything? */
@@ -944,38 +949,119 @@ static int ceph_writepages_start(struct address_space *mapping,
			/* shift unused pages over in the pvec...  we
			/* shift unused pages over in the pvec...  we
			 * will need to release them below. */
			 * will need to release them below. */
			for (j = i; j < pvec_pages; j++) {
			for (j = i; j < pvec_pages; j++) {
				dout(" pvec leftover page %p\n",
				dout(" pvec leftover page %p\n", pvec.pages[j]);
				     pvec.pages[j]);
				pvec.pages[j-i+first] = pvec.pages[j];
				pvec.pages[j-i+first] = pvec.pages[j];
			}
			}
			pvec.nr -= i-first;
			pvec.nr -= i-first;
		}
		}


		/* Format the osd request message and submit the write */
new_request:
		offset = page_offset(pages[0]);
		offset = page_offset(pages[0]);
		len = (u64)locked_pages << PAGE_CACHE_SHIFT;
		len = wsize;
		if (snap_size == -1) {

			len = min(len, (u64)i_size_read(inode) - offset);
		req = ceph_osdc_new_request(&fsc->client->osdc,
					&ci->i_layout, vino,
					offset, &len, 0, num_ops,
					CEPH_OSD_OP_WRITE,
					CEPH_OSD_FLAG_WRITE |
					CEPH_OSD_FLAG_ONDISK,
					snapc, truncate_seq,
					truncate_size, false);
		if (IS_ERR(req)) {
			req = ceph_osdc_new_request(&fsc->client->osdc,
						&ci->i_layout, vino,
						offset, &len, 0,
						min(num_ops,
						    CEPH_OSD_SLAB_OPS),
						CEPH_OSD_OP_WRITE,
						CEPH_OSD_FLAG_WRITE |
						CEPH_OSD_FLAG_ONDISK,
						snapc, truncate_seq,
						truncate_size, true);
			BUG_ON(IS_ERR(req));
		}
		BUG_ON(len < page_offset(pages[locked_pages - 1]) +
			     PAGE_CACHE_SIZE - offset);

		req->r_callback = writepages_finish;
		req->r_inode = inode;

		/* Format the osd request message and submit the write */
		len = 0;
		data_pages = pages;
		op_idx = 0;
		for (i = 0; i < locked_pages; i++) {
			u64 cur_offset = page_offset(pages[i]);
			if (offset + len != cur_offset) {
				if (op_idx + do_sync + 1 == req->r_num_ops)
					break;
				osd_req_op_extent_dup_last(req, op_idx,
							   cur_offset - offset);
				dout("writepages got pages at %llu~%llu\n",
				     offset, len);
				osd_req_op_extent_osd_data_pages(req, op_idx,
							data_pages, len, 0,
							!!pool, false);
				osd_req_op_extent_update(req, op_idx, len);

				len = 0;
				offset = cur_offset; 
				data_pages = pages + i;
				op_idx++;
			}

			set_page_writeback(pages[i]);
			len += PAGE_CACHE_SIZE;
		}

		if (snap_size != -1) {
			len = min(len, snap_size - offset);
		} else if (i == locked_pages) {
			/* writepages_finish() clears writeback pages
			/* writepages_finish() clears writeback pages
			 * according to the data length, so make sure
			 * according to the data length, so make sure
			 * data length covers all locked pages */
			 * data length covers all locked pages */
			len = max(len, 1 +
			u64 min_len = len + 1 - PAGE_CACHE_SIZE;
				((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
			len = min(len, (u64)i_size_read(inode) - offset);
		} else {
			len = max(len, min_len);
			len = min(len, snap_size - offset);
		}
		}
		dout("writepages got %d pages at %llu~%llu\n",
		dout("writepages got pages at %llu~%llu\n", offset, len);
		     locked_pages, offset, len);

		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
							!!pool, false);


		pages = NULL;	/* request message now owns the pages array */
		osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
		pool = NULL;
						 0, !!pool, false);
		osd_req_op_extent_update(req, op_idx, len);


		/* Update the write op length in case we changed it */
		if (do_sync) {
			op_idx++;
			osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0);
		}
		BUG_ON(op_idx + 1 != req->r_num_ops);


		osd_req_op_extent_update(req, 0, len);
		pool = NULL;
		if (i < locked_pages) {
			BUG_ON(num_ops <= req->r_num_ops);
			num_ops -= req->r_num_ops;
			num_ops += do_sync;
			locked_pages -= i;

			/* allocate new pages array for next request */
			data_pages = pages;
			pages = kmalloc(locked_pages * sizeof (*pages),
					GFP_NOFS);
			if (!pages) {
				pool = fsc->wb_pagevec_pool;
				pages = mempool_alloc(pool, GFP_NOFS);
				BUG_ON(!pages);
			}
			memcpy(pages, data_pages + i,
			       locked_pages * sizeof(*pages));
			memset(data_pages + i, 0,
			       locked_pages * sizeof(*pages));
		} else {
			BUG_ON(num_ops != req->r_num_ops);
			index = pages[i - 1]->index + 1;
			/* request message now owns the pages array */
			pages = NULL;
		}


		vino = ceph_vino(inode);
		vino = ceph_vino(inode);
		ceph_osdc_build_request(req, offset, snapc, vino.snap,
		ceph_osdc_build_request(req, offset, snapc, vino.snap,
@@ -985,9 +1071,10 @@ static int ceph_writepages_start(struct address_space *mapping,
		BUG_ON(rc);
		BUG_ON(rc);
		req = NULL;
		req = NULL;


		/* continue? */
		wbc->nr_to_write -= i;
		index = next;
		if (pages)
		wbc->nr_to_write -= locked_pages;
			goto new_request;

		if (wbc->nr_to_write <= 0)
		if (wbc->nr_to_write <= 0)
			done = 1;
			done = 1;


@@ -1522,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
				    ceph_vino(inode), 0, &len, 0, 1,
				    ceph_vino(inode), 0, &len, 0, 1,
				    CEPH_OSD_OP_CREATE,
				    CEPH_OSD_OP_CREATE,
				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
				    ceph_empty_snapc, 0, 0, false);
				    NULL, 0, 0, false);
	if (IS_ERR(req)) {
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		err = PTR_ERR(req);
		goto out;
		goto out;
@@ -1540,9 +1627,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
				    ceph_vino(inode), 0, &len, 1, 3,
				    ceph_vino(inode), 0, &len, 1, 3,
				    CEPH_OSD_OP_WRITE,
				    CEPH_OSD_OP_WRITE,
				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
				    ceph_empty_snapc,
				    NULL, ci->i_truncate_seq,
				    ci->i_truncate_seq, ci->i_truncate_size,
				    ci->i_truncate_size, false);
				    false);
	if (IS_ERR(req)) {
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		err = PTR_ERR(req);
		goto out;
		goto out;
@@ -1663,8 +1749,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
		goto out;
		goto out;
	}
	}


	rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
	rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
					 ceph_empty_snapc,
					 1, false, GFP_NOFS);
					 1, false, GFP_NOFS);
	if (!rd_req) {
	if (!rd_req) {
		err = -ENOMEM;
		err = -ENOMEM;
@@ -1678,8 +1763,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
		 "%llx.00000000", ci->i_vino.ino);
		 "%llx.00000000", ci->i_vino.ino);
	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);


	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
					 ceph_empty_snapc,
					 1, false, GFP_NOFS);
					 1, false, GFP_NOFS);
	if (!wr_req) {
	if (!wr_req) {
		err = -ENOMEM;
		err = -ENOMEM;
+7 −4
Original line number Original line Diff line number Diff line
@@ -991,7 +991,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
			u32 seq, u64 flush_tid, u64 oldest_flush_tid,
			u32 seq, u64 flush_tid, u64 oldest_flush_tid,
			u32 issue_seq, u32 mseq, u64 size, u64 max_size,
			u32 issue_seq, u32 mseq, u64 size, u64 max_size,
			struct timespec *mtime, struct timespec *atime,
			struct timespec *mtime, struct timespec *atime,
			u64 time_warp_seq,
			struct timespec *ctime, u64 time_warp_seq,
			kuid_t uid, kgid_t gid, umode_t mode,
			kuid_t uid, kgid_t gid, umode_t mode,
			u64 xattr_version,
			u64 xattr_version,
			struct ceph_buffer *xattrs_buf,
			struct ceph_buffer *xattrs_buf,
@@ -1042,6 +1042,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
		ceph_encode_timespec(&fc->mtime, mtime);
		ceph_encode_timespec(&fc->mtime, mtime);
	if (atime)
	if (atime)
		ceph_encode_timespec(&fc->atime, atime);
		ceph_encode_timespec(&fc->atime, atime);
	if (ctime)
		ceph_encode_timespec(&fc->ctime, ctime);
	fc->time_warp_seq = cpu_to_le32(time_warp_seq);
	fc->time_warp_seq = cpu_to_le32(time_warp_seq);


	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
@@ -1116,7 +1118,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
	int held, revoking, dropping, keep;
	int held, revoking, dropping, keep;
	u64 seq, issue_seq, mseq, time_warp_seq, follows;
	u64 seq, issue_seq, mseq, time_warp_seq, follows;
	u64 size, max_size;
	u64 size, max_size;
	struct timespec mtime, atime;
	struct timespec mtime, atime, ctime;
	int wake = 0;
	int wake = 0;
	umode_t mode;
	umode_t mode;
	kuid_t uid;
	kuid_t uid;
@@ -1180,6 +1182,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
	ci->i_requested_max_size = max_size;
	ci->i_requested_max_size = max_size;
	mtime = inode->i_mtime;
	mtime = inode->i_mtime;
	atime = inode->i_atime;
	atime = inode->i_atime;
	ctime = inode->i_ctime;
	time_warp_seq = ci->i_time_warp_seq;
	time_warp_seq = ci->i_time_warp_seq;
	uid = inode->i_uid;
	uid = inode->i_uid;
	gid = inode->i_gid;
	gid = inode->i_gid;
@@ -1198,7 +1201,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
		op, keep, want, flushing, seq,
		op, keep, want, flushing, seq,
		flush_tid, oldest_flush_tid, issue_seq, mseq,
		flush_tid, oldest_flush_tid, issue_seq, mseq,
		size, max_size, &mtime, &atime, time_warp_seq,
		size, max_size, &mtime, &atime, &ctime, time_warp_seq,
		uid, gid, mode, xattr_version, xattr_blob,
		uid, gid, mode, xattr_version, xattr_blob,
		follows, inline_data);
		follows, inline_data);
	if (ret < 0) {
	if (ret < 0) {
@@ -1320,7 +1323,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
			     capsnap->dirty, 0, capsnap->flush_tid, 0,
			     capsnap->dirty, 0, capsnap->flush_tid, 0,
			     0, mseq, capsnap->size, 0,
			     0, mseq, capsnap->size, 0,
			     &capsnap->mtime, &capsnap->atime,
			     &capsnap->mtime, &capsnap->atime,
			     capsnap->time_warp_seq,
			     &capsnap->ctime, capsnap->time_warp_seq,
			     capsnap->uid, capsnap->gid, capsnap->mode,
			     capsnap->uid, capsnap->gid, capsnap->mode,
			     capsnap->xattr_version, capsnap->xattr_blob,
			     capsnap->xattr_version, capsnap->xattr_blob,
			     capsnap->follows, capsnap->inline_data);
			     capsnap->follows, capsnap->inline_data);
+47 −22
Original line number Original line Diff line number Diff line
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
	if (dentry->d_fsdata)
	if (dentry->d_fsdata)
		return 0;
		return 0;


	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO);
	di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
	if (!di)
	if (!di)
		return -ENOMEM;          /* oh well */
		return -ENOMEM;          /* oh well */


@@ -68,23 +68,6 @@ int ceph_init_dentry(struct dentry *dentry)
	return 0;
	return 0;
}
}


struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
{
	struct inode *inode = NULL;

	if (!dentry)
		return NULL;

	spin_lock(&dentry->d_lock);
	if (!IS_ROOT(dentry)) {
		inode = d_inode(dentry->d_parent);
		ihold(inode);
	}
	spin_unlock(&dentry->d_lock);
	return inode;
}


/*
/*
 * for readdir, we encode the directory frag and offset within that
 * for readdir, we encode the directory frag and offset within that
 * frag into f_pos.
 * frag into f_pos.
@@ -624,6 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct ceph_mds_request *req;
	int op;
	int op;
	int mask;
	int err;
	int err;


	dout("lookup %p dentry %p '%pd'\n",
	dout("lookup %p dentry %p '%pd'\n",
@@ -666,8 +650,12 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
		return ERR_CAST(req);
		return ERR_CAST(req);
	req->r_dentry = dget(dentry);
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_num_caps = 2;
	/* we only need inode linkage */

	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	if (ceph_security_xattr_wanted(dir))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.getattr.mask = cpu_to_le32(mask);

	req->r_locked_dir = dir;
	req->r_locked_dir = dir;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	err = ceph_handle_snapdir(req, dentry, err);
	err = ceph_handle_snapdir(req, dentry, err);
@@ -1095,6 +1083,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
{
	int valid = 0;
	int valid = 0;
	struct dentry *parent;
	struct inode *dir;
	struct inode *dir;


	if (flags & LOOKUP_RCU)
	if (flags & LOOKUP_RCU)
@@ -1103,7 +1092,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
	dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
	dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
	     dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
	     dentry, d_inode(dentry), ceph_dentry(dentry)->offset);


	dir = ceph_get_dentry_parent_inode(dentry);
	parent = dget_parent(dentry);
	dir = d_inode(parent);


	/* always trust cached snapped dentries, snapdir dentry */
	/* always trust cached snapped dentries, snapdir dentry */
	if (ceph_snap(dir) != CEPH_NOSNAP) {
	if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1121,13 +1111,48 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
			valid = 1;
			valid = 1;
	}
	}


	if (!valid) {
		struct ceph_mds_client *mdsc =
			ceph_sb_to_client(dir->i_sb)->mdsc;
		struct ceph_mds_request *req;
		int op, mask, err;

		op = ceph_snap(dir) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
		req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
		if (!IS_ERR(req)) {
			req->r_dentry = dget(dentry);
			req->r_num_caps = 2;

			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
			if (ceph_security_xattr_wanted(dir))
				mask |= CEPH_CAP_XATTR_SHARED;
			req->r_args.getattr.mask = mask;

			req->r_locked_dir = dir;
			err = ceph_mdsc_do_request(mdsc, NULL, req);
			if (err == 0 || err == -ENOENT) {
				if (dentry == req->r_dentry) {
					valid = !d_unhashed(dentry);
				} else {
					d_invalidate(req->r_dentry);
					err = -EAGAIN;
				}
			}
			ceph_mdsc_put_request(req);
			dout("d_revalidate %p lookup result=%d\n",
			     dentry, err);
		}
	}

	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
	if (valid) {
	if (valid) {
		ceph_dentry_lru_touch(dentry);
		ceph_dentry_lru_touch(dentry);
	} else {
	} else {
		ceph_dir_clear_complete(dir);
		ceph_dir_clear_complete(dir);
	}
	}
	iput(dir);

	dput(parent);
	return valid;
	return valid;
}
}


+13 −0
Original line number Original line Diff line number Diff line
@@ -71,12 +71,18 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
	inode = ceph_find_inode(sb, vino);
	inode = ceph_find_inode(sb, vino);
	if (!inode) {
	if (!inode) {
		struct ceph_mds_request *req;
		struct ceph_mds_request *req;
		int mask;


		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
					       USE_ANY_MDS);
					       USE_ANY_MDS);
		if (IS_ERR(req))
		if (IS_ERR(req))
			return ERR_CAST(req);
			return ERR_CAST(req);


		mask = CEPH_STAT_CAP_INODE;
		if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
			mask |= CEPH_CAP_XATTR_SHARED;
		req->r_args.getattr.mask = cpu_to_le32(mask);

		req->r_ino1 = vino;
		req->r_ino1 = vino;
		req->r_num_caps = 1;
		req->r_num_caps = 1;
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -128,6 +134,7 @@ static struct dentry *__get_parent(struct super_block *sb,
	struct ceph_mds_request *req;
	struct ceph_mds_request *req;
	struct inode *inode;
	struct inode *inode;
	struct dentry *dentry;
	struct dentry *dentry;
	int mask;
	int err;
	int err;


	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
@@ -144,6 +151,12 @@ static struct dentry *__get_parent(struct super_block *sb,
			.snap = CEPH_NOSNAP,
			.snap = CEPH_NOSNAP,
		};
		};
	}
	}

	mask = CEPH_STAT_CAP_INODE;
	if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.getattr.mask = cpu_to_le32(mask);

	req->r_num_caps = 1;
	req->r_num_caps = 1;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	inode = req->r_target_inode;
	inode = req->r_target_inode;
Loading