Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 7f155c70 authored by Linus Torvalds
Browse files
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable bugfixes:
   - nfs: don't create zero-length requests

   - several LAYOUTGET bugfixes

  Features:
   - several performance related features

   - more aggressive caching when we can rely on close-to-open
     cache consistency

   - remove serialisation of O_DIRECT reads and writes

   - optimise several code paths to not flush to disk unnecessarily.

     However allow for the idiosyncrasies of pNFS for those layout
     types that need to issue a LAYOUTCOMMIT before the metadata can
     be updated on the server.

   - SUNRPC updates to the client data receive path

   - pNFS/SCSI support RH/Fedora dm-mpath device nodes

   - pNFS files/flexfiles can now use unprivileged ports when
     the generic NFS mount options allow it.

  Bugfixes:
   - Don't use RDMA direct data placement together with data
     integrity or privacy security flavours

   - Remove the RDMA ALLPHYSICAL memory registration mode as
     it has potential security holes.

   - Several layout recall fixes to improve NFSv4.1 protocol
     compliance.

   - Fix an Oops in the pNFS files and flexfiles connection
     setup to the DS

   - Allow retry of operations that used a returned delegation
     stateid

   - Don't mark the inode as revalidated if a LAYOUTCOMMIT is
     outstanding

   - Fix writeback races in nfs4_copy_range() and
     nfs42_proc_deallocate()"

* tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (104 commits)
  pNFS: Actively set attributes as invalid if LAYOUTCOMMIT is outstanding
  NFSv4: Clean up lookup of SECINFO_NO_NAME
  NFSv4.2: Fix warning "variable ‘stateids’ set but not used"
  NFSv4: Fix warning "no previous prototype for ‘nfs4_listxattr’"
  SUNRPC: Fix a compiler warning in fs/nfs/clnt.c
  pNFS: Remove redundant smp_mb() from pnfs_init_lseg()
  pNFS: Cleanup - do layout segment initialisation in one place
  pNFS: Remove redundant stateid invalidation
  pNFS: Remove redundant pnfs_mark_layout_returned_if_empty()
  pNFS: Clear the layout metadata if the server changed the layout stateid
  pNFS: Cleanup - don't open code pnfs_mark_layout_stateid_invalid()
  NFS: pnfs_mark_matching_lsegs_return() should match the layout sequence id
  pNFS: Do not set plh_return_seq for non-callback related layoutreturns
  pNFS: Ensure layoutreturn acts as a completion for layout callbacks
  pNFS: Fix CB_LAYOUTRECALL stateid verification
  pNFS: Always update the layout barrier seqid on LAYOUTGET
  pNFS: Always update the layout stateid if NFS_LAYOUT_INVALID_STID is set
  pNFS: Clear the layout return tracking on layout reinitialisation
  pNFS: LAYOUTRETURN should only update the stateid if the layout is valid
  nfs: don't create zero-length requests
  ...
parents d761f3ed 944171cb
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o

CFLAGS_nfstrace.o += -I$(src)
nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o \
			   direct.o pagelist.o read.o symlink.o unlink.o \
			   io.o direct.o pagelist.o read.o symlink.o unlink.o \
			   write.o namespace.o mount_clnt.o nfstrace.o
nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
nfs-$(CONFIG_SYSCTL)	+= sysctl.o
+75 −35
Original line number Diff line number Diff line
@@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
		if (!p)
			return -EIO;
		b->simple.nr_sigs = be32_to_cpup(p++);
		if (!b->simple.nr_sigs) {
			dprintk("no signature\n");
		if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
			dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
			return -EIO;
		}

@@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
			memcpy(&b->simple.sigs[i].sig, p,
				b->simple.sigs[i].sig_len);

			b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
			b->simple.len += 8 + 4 + \
				(XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
		}
		break;
	case PNFS_BLOCK_VOLUME_SLICE:
@@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
		p = xdr_inline_decode(xdr, 4);
		if (!p)
			return -EIO;

		b->concat.volumes_count = be32_to_cpup(p++);
		if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
			dprintk("Too many volumes: %d\n", b->concat.volumes_count);
			return -EIO;
		}

		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
		if (!p)
@@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
		p = xdr_inline_decode(xdr, 8 + 4);
		if (!p)
			return -EIO;

		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
		b->stripe.volumes_count = be32_to_cpup(p++);
		if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
			dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
			return -EIO;
		}

		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
		if (!p)
@@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
	struct pnfs_block_volume *v = &volumes[idx];
	struct block_device *bdev;
	dev_t dev;

	dev = bl_resolve_deviceid(server, v, gfp_mask);
	if (!dev)
		return -EIO;

	d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
	if (IS_ERR(d->bdev)) {
	bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
	if (IS_ERR(bdev)) {
		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
			MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
		return PTR_ERR(d->bdev);
			MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
		return PTR_ERR(bdev);
	}
	d->bdev = bdev;


	d->len = i_size_read(d->bdev->bd_inode);
@@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v)
	}
}

/*
 * Open the generic udev by-id path ("wwn-0x<designator>") for this volume's
 * WWN.  At least on Debian the udev by-id path will always point to the
 * dm-multipath device if one exists.
 *
 * Returns the value handed back by blkdev_get_by_path(), i.e. either the
 * opened block device or an ERR_PTR(), or ERR_PTR(-ENOMEM) when the path
 * string could not be allocated.  A failed open is reported with pr_warn().
 */
static struct block_device *
bl_open_udev_path(struct pnfs_block_volume *v)
{
	const char *path = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
				v->scsi.designator_len, v->scsi.designator);
	struct block_device *blk;

	if (path == NULL)
		return ERR_PTR(-ENOMEM);

	blk = blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE, NULL);
	if (IS_ERR(blk))
		pr_warn("pNFS: failed to open device %s (%ld)\n",
			path, PTR_ERR(blk));

	/* The path is only needed for the lookup and the warning above. */
	kfree(path);
	return blk;
}

/*
 * Open the RH/Fedora specific dm-mpath udev path for this WWN
 * ("dm-uuid-mpath-<designator type><designator>").  On those systems the
 * wwn- links only point to the first discovered SCSI device.
 *
 * Returns the value handed back by blkdev_get_by_path(), i.e. either the
 * opened block device or an ERR_PTR(), or ERR_PTR(-ENOMEM) when the path
 * string could not be allocated.  No warning is printed on failure.
 */
static struct block_device *
bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
{
	const char *path = kasprintf(GFP_KERNEL,
			"/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
			v->scsi.designator_type,
			v->scsi.designator_len, v->scsi.designator);
	struct block_device *blk;

	if (path == NULL)
		return ERR_PTR(-ENOMEM);

	blk = blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE, NULL);

	/* The path string is no longer needed once the lookup has run. */
	kfree(path);
	return blk;
}

static int
bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
	struct pnfs_block_volume *v = &volumes[idx];
	struct block_device *bdev;
	const struct pr_ops *ops;
	const char *devname;
	int error;

	if (!bl_validate_designator(v))
		return -EINVAL;

	switch (v->scsi.designator_len) {
	case 8:
		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
				v->scsi.designator);
		break;
	case 12:
		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
				v->scsi.designator);
		break;
	case 16:
		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
				v->scsi.designator);
		break;
	default:
		return -EINVAL;
	}

	d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
	if (IS_ERR(d->bdev)) {
		pr_warn("pNFS: failed to open device %s (%ld)\n",
			devname, PTR_ERR(d->bdev));
		kfree(devname);
		return PTR_ERR(d->bdev);
	}

	kfree(devname);
	bdev = bl_open_dm_mpath_udev_path(v);
	if (IS_ERR(bdev))
		bdev = bl_open_udev_path(v);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	d->bdev = bdev;

	d->len = i_size_read(d->bdev->bd_inode);
	d->map = bl_map_simple;
@@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
	return 0;

out_blkdev_put:
	blkdev_put(d->bdev, FMODE_READ);
	blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
	return error;
}

+21 −6
Original line number Diff line number Diff line
@@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
	return be;
}

/*
 * Dispose of every extent queued on @head: drop the reference on the
 * extent's deviceid node, then free the extent itself.
 *
 * Entries are not unlinked and @head is not reinitialised, so the list
 * must not be walked again once this returns.
 */
static void __ext_put_deviceids(struct list_head *head)
{
	struct pnfs_block_extent *ext;
	struct pnfs_block_extent *next;

	/* _safe variant: @ext is freed inside the loop body. */
	list_for_each_entry_safe(ext, next, head, be_list) {
		nfs4_put_deviceid_node(ext->be_device);
		kfree(ext);
	}
}

static void
__ext_tree_insert(struct rb_root *root,
		struct pnfs_block_extent *new, bool merge_ok)
@@ -163,7 +173,8 @@ __ext_tree_insert(struct rb_root *root,
}

static int
__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
__ext_tree_remove(struct rb_root *root,
		sector_t start, sector_t end, struct list_head *tmp)
{
	struct pnfs_block_extent *be;
	sector_t len1 = 0, len2 = 0;
@@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
			struct pnfs_block_extent *next = ext_tree_next(be);

			rb_erase(&be->be_node, root);
			nfs4_put_deviceid_node(be->be_device);
			kfree(be);
			list_add_tail(&be->be_list, tmp);
			be = next;
		}

@@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
		sector_t start, sector_t end)
{
	int err, err2;
	LIST_HEAD(tmp);

	spin_lock(&bl->bl_ext_lock);
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
	if (rw) {
		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
		if (!err)
			err = err2;
	}
	spin_unlock(&bl->bl_ext_lock);

	__ext_put_deviceids(&tmp);
	return err;
}

@@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
	sector_t end = start + len;
	struct pnfs_block_extent *be;
	int err = 0;
	LIST_HEAD(tmp);

	spin_lock(&bl->bl_ext_lock);
	/*
	 * First remove all COW extents or holes from written to range.
	 */
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
	if (err)
		goto out;

@@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
	}
out:
	spin_unlock(&bl->bl_ext_lock);

	__ext_put_deviceids(&tmp);
	return err;
}

+44 −20
Original line number Diff line number Diff line
@@ -119,27 +119,30 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
 * hashed by filehandle.
 */
static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
		struct nfs_fh *fh, nfs4_stateid *stateid)
		struct nfs_fh *fh)
{
	struct nfs_server *server;
	struct nfs_inode *nfsi;
	struct inode *ino;
	struct pnfs_layout_hdr *lo;

restart:
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		list_for_each_entry(lo, &server->layouts, plh_layouts) {
			if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
			nfsi = NFS_I(lo->plh_inode);
			if (nfs_compare_fh(fh, &nfsi->fh))
				continue;
			if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
			if (nfsi->layout != lo)
				continue;
			ino = igrab(lo->plh_inode);
			if (!ino)
				break;
			spin_lock(&ino->i_lock);
			/* Is this layout in the process of being freed? */
			if (NFS_I(ino)->layout != lo) {
			if (nfsi->layout != lo) {
				spin_unlock(&ino->i_lock);
				iput(ino);
				break;
				goto restart;
			}
			pnfs_get_layout_hdr(lo);
			spin_unlock(&ino->i_lock);
@@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
}

static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
		struct nfs_fh *fh, nfs4_stateid *stateid)
		struct nfs_fh *fh)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&clp->cl_lock);
	rcu_read_lock();
	lo = get_layout_by_fh_locked(clp, fh, stateid);
	lo = get_layout_by_fh_locked(clp, fh);
	rcu_read_unlock();
	spin_unlock(&clp->cl_lock);

@@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
/*
 * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
 */
static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
					const nfs4_stateid *new)
{
	u32 oldseq, newseq;

	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
	/* Is the stateid still not initialised? */
	if (!pnfs_layout_is_valid(lo))
		return NFS4ERR_DELAY;

	/* Mismatched stateid? */
	if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
		return NFS4ERR_BAD_STATEID;

	newseq = be32_to_cpu(new->seqid);
	/* Are we already in a layout recall situation? */
	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
	    lo->plh_return_seq != 0) {
		if (newseq < lo->plh_return_seq)
			return NFS4ERR_OLD_STATEID;
		if (newseq > lo->plh_return_seq)
			return NFS4ERR_DELAY;
		goto out;
	}

	/* Check that the stateid matches what we think it should be. */
	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
	if (newseq > oldseq + 1)
		return false;
	return true;
		return NFS4ERR_DELAY;
	/* Crazy server! */
	if (newseq <= oldseq)
		return NFS4ERR_OLD_STATEID;
out:
	return NFS_OK;
}

static u32 initiate_file_draining(struct nfs_client *clp,
@@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
	LIST_HEAD(free_me_list);

	lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
	lo = get_layout_by_fh(clp, &args->cbl_fh);
	if (!lo) {
		trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
				&args->cbl_stateid, -rv);
@@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
	}

	ino = lo->plh_inode;
	pnfs_layoutcommit_inode(ino, false);


	spin_lock(&ino->i_lock);
	if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
		rv = NFS4ERR_DELAY;
	rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
	if (rv != NFS_OK)
		goto unlock;
	}
	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
	spin_unlock(&ino->i_lock);

	pnfs_layoutcommit_inode(ino, false);

	spin_lock(&ino->i_lock);
	/*
	 * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
	 */
@@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp,
		goto unlock;
	}

	/* Embrace your forgetfulness! */
	rv = NFS4ERR_NOMATCHING_LAYOUT;

	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
			&args->cbl_range);
	}
	pnfs_mark_layout_returned_if_empty(lo);
unlock:
	spin_unlock(&ino->i_lock);
	pnfs_free_lseg_list(&free_me_list);
+5 −1
Original line number Diff line number Diff line
@@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
	if (hdr_arg.minorversion == 0) {
		cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
		if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
			return rpc_drop_reply;
			goto out_invalidcred;
	}

	cps.minorversion = hdr_arg.minorversion;
@@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
	nfs_put_client(cps.clp);
	dprintk("%s: done, status = %u\n", __func__, ntohl(status));
	return rpc_success;

out_invalidcred:
	pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
	return rpc_autherr_badcred;
}

/*
Loading