Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f99d4fbd authored by Christoph Hellwig's avatar Christoph Hellwig Committed by J. Bruce Fields
Browse files

nfsd: add SCSI layout support



This is a simple extension to the block layout driver to use SCSI
persistent reservations for access control and fencing, as well as
SCSI VPD pages for device identification.

For this we need to pass the nfs4_client to the proc_getdeviceinfo method
to generate the reservation key, and add a new fence_client method
to allow for fence actions in the layout driver.

Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarJ. Bruce Fields <bfields@redhat.com>
parent 368248ee
Loading
Loading
Loading
Loading
+23 −0
Original line number Diff line number Diff line

pNFS SCSI layout server user guide
==================================

This document describes support for pNFS SCSI layouts in the Linux NFS server.
With pNFS SCSI layouts, the NFS server acts as Metadata Server (MDS) for pNFS,
which in addition to handling all the metadata access to the NFS export,
also hands out layouts to the clients so that they can directly access the
underlying SCSI LUNs that are shared with the client.

To use pNFS SCSI layouts with with the Linux NFS server, the exported file
system needs to support the pNFS SCSI layouts (currently just XFS), and the
file system must sit on a SCSI LUN that is accessible to the clients in
addition to the MDS.  As of now the file system needs to sit directly on the
exported LUN, striping or concatenation of LUNs on the MDS and clients
is not supported yet.

On a server built with CONFIG_NFSD_SCSI, the pNFS SCSI volume support is
automatically enabled if the file system is exported using the "pnfs"
option and the underlying SCSI device support persistent reservations.
On the client make sure the kernel has the CONFIG_PNFS_BLOCK option
enabled, and the file system is mounted using the NFSv4.1 protocol
version (mount -o vers=4.1).
+13 −0
Original line number Diff line number Diff line
@@ -98,6 +98,19 @@ config NFSD_BLOCKLAYOUT

	  If unsure, say N.

config NFSD_SCSILAYOUT
	bool "NFSv4.1 server support for pNFS SCSI layouts"
	depends on NFSD_V4
	select NFSD_PNFS
	help
	  This option enables support for the exporting pNFS SCSI layouts
	  in the kernel's NFS server. The pNFS SCSI layout enables NFS
	  clients to directly perform I/O to SCSI devices accesible to both
	  the server and the clients.  See draft-ietf-nfsv4-scsi-layout for
	  more details.

	  If unsure, say N.

config NFSD_V4_SECURITY_LABEL
	bool "Provide Security Label support for NFSv4 server"
	depends on NFSD_V4 && SECURITY
+1 −0
Original line number Diff line number Diff line
@@ -19,3 +19,4 @@ nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
			   nfs4acl.o nfs4callback.o nfs4recover.o
nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
+207 −1
Original line number Diff line number Diff line
/*
 * Copyright (c) 2014 Christoph Hellwig.
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */
#include <linux/exportfs.h>
#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/pr.h>

#include <linux/nfsd/debug.h>
#include <scsi/scsi_proto.h>
#include <scsi/scsi_common.h>

#include "blocklayoutxdr.h"
#include "pnfs.h"
@@ -159,6 +162,7 @@ nfsd4_block_get_device_info_simple(struct super_block *sb,

static __be32
nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
		struct nfs4_client *clp,
		struct nfsd4_getdeviceinfo *gdp)
{
	if (sb->s_bdev != sb->s_bdev->bd_contains)
@@ -200,3 +204,205 @@ const struct nfsd4_layout_ops bl_layout_ops = {
	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
};
#endif /* CONFIG_NFSD_BLOCKLAYOUT */

#ifdef CONFIG_NFSD_SCSILAYOUT
static int nfsd4_scsi_identify_device(struct block_device *bdev,
		struct pnfs_block_volume *b)
{
	struct request_queue *q = bdev->bd_disk->queue;
	struct request *rq;
	size_t bufflen = 252, len, id_len;
	u8 *buf, *d, type, assoc;
	int error;

	buf = kzalloc(bufflen, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (IS_ERR(rq)) {
		error = -ENOMEM;
		goto out_free_buf;
	}
	blk_rq_set_block_pc(rq);

	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
	if (error)
		goto out_put_request;

	rq->cmd[0] = INQUIRY;
	rq->cmd[1] = 1;
	rq->cmd[2] = 0x83;
	rq->cmd[3] = bufflen >> 8;
	rq->cmd[4] = bufflen & 0xff;
	rq->cmd_len = COMMAND_SIZE(INQUIRY);

	error = blk_execute_rq(rq->q, NULL, rq, 1);
	if (error) {
		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
			rq->errors);
		goto out_put_request;
	}

	len = (buf[2] << 8) + buf[3] + 4;
	if (len > bufflen) {
		pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
			len);
		goto out_put_request;
	}

	d = buf + 4;
	for (d = buf + 4; d < buf + len; d += id_len + 4) {
		id_len = d[3];
		type = d[1] & 0xf;
		assoc = (d[1] >> 4) & 0x3;

		/*
		 * We only care about a EUI-64 and NAA designator types
		 * with LU association.
		 */
		if (assoc != 0x00)
			continue;
		if (type != 0x02 && type != 0x03)
			continue;
		if (id_len != 8 && id_len != 12 && id_len != 16)
			continue;

		b->scsi.code_set = PS_CODE_SET_BINARY;
		b->scsi.designator_type = type == 0x02 ?
			PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
		b->scsi.designator_len = id_len;
		memcpy(b->scsi.designator, d + 4, id_len);

		/*
		 * If we found a 8 or 12 byte descriptor continue on to
		 * see if a 16 byte one is available.  If we find a
		 * 16 byte descriptor we're done.
		 */
		if (id_len == 16)
			break;
	}

out_put_request:
	blk_put_request(rq);
out_free_buf:
	kfree(buf);
	return error;
}

#define NFSD_MDS_PR_KEY		0x0100000000000000

/*
 * We use the client ID as a unique key for the reservations.
 * This allows us to easily fence a client when recalls fail.
 */
static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
{
	return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
}

static int
nfsd4_block_get_device_info_scsi(struct super_block *sb,
		struct nfs4_client *clp,
		struct nfsd4_getdeviceinfo *gdp)
{
	struct pnfs_block_deviceaddr *dev;
	struct pnfs_block_volume *b;
	const struct pr_ops *ops;
	int error;

	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	gdp->gd_device = dev;

	dev->nr_volumes = 1;
	b = &dev->volumes[0];

	b->type = PNFS_BLOCK_VOLUME_SCSI;
	b->scsi.pr_key = nfsd4_scsi_pr_key(clp);

	error = nfsd4_scsi_identify_device(sb->s_bdev, b);
	if (error)
		return error;

	ops = sb->s_bdev->bd_disk->fops->pr_ops;
	if (!ops) {
		pr_err("pNFS: device %s does not support PRs.\n",
			sb->s_id);
		return -EINVAL;
	}

	error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
	if (error) {
		pr_err("pNFS: failed to register key for device %s.\n",
			sb->s_id);
		return -EINVAL;
	}

	error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
			PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
	if (error) {
		pr_err("pNFS: failed to reserve device %s.\n",
			sb->s_id);
		return -EINVAL;
	}

	return 0;
}

static __be32
nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
		struct nfs4_client *clp,
		struct nfsd4_getdeviceinfo *gdp)
{
	if (sb->s_bdev != sb->s_bdev->bd_contains)
		return nfserr_inval;
	return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
}
static __be32
nfsd4_scsi_proc_layoutcommit(struct inode *inode,
		struct nfsd4_layoutcommit *lcp)
{
	struct iomap *iomaps;
	int nr_iomaps;

	nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
	if (nr_iomaps < 0)
		return nfserrno(nr_iomaps);

	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}

static void
nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
{
	struct nfs4_client *clp = ls->ls_stid.sc_client;
	struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;

	bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
			nfsd4_scsi_pr_key(clp), 0, true);
}

const struct nfsd4_layout_ops scsi_layout_ops = {
	/*
	 * Pretend that we send notification to the client.  This is a blatant
	 * lie to force recent Linux clients to cache our device IDs.
	 * We rarely ever change the device ID, so the harm of leaking deviceids
	 * for a while isn't too bad.  Unfortunately RFC5661 is a complete mess
	 * in this regard, but I filed errata 4119 for this a while ago, and
	 * hopefully the Linux client will eventually start caching deviceids
	 * without this again.
	 */
	.notify_types		=
			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
	.proc_getdeviceinfo	= nfsd4_scsi_proc_getdeviceinfo,
	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
	.proc_layoutget		= nfsd4_block_proc_layoutget,
	.encode_layoutget	= nfsd4_block_encode_layoutget,
	.proc_layoutcommit	= nfsd4_scsi_proc_layoutcommit,
	.fence_client		= nfsd4_scsi_fence_client,
};
#endif /* CONFIG_NFSD_SCSILAYOUT */
+64 −1
Original line number Diff line number Diff line
/*
 * Copyright (c) 2014 Christoph Hellwig.
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */
#include <linux/sunrpc/svc.h>
#include <linux/exportfs.h>
@@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
		p = xdr_encode_hyper(p, b->simple.offset);
		p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
		break;
	case PNFS_BLOCK_VOLUME_SCSI:
		len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
		p = xdr_reserve_space(xdr, len);
		if (!p)
			return -ETOOSMALL;

		*p++ = cpu_to_be32(b->type);
		*p++ = cpu_to_be32(b->scsi.code_set);
		*p++ = cpu_to_be32(b->scsi.designator_type);
		p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
		p = xdr_encode_hyper(p, b->scsi.pr_key);
		break;
	default:
		return -ENOTSUPP;
	}
@@ -155,3 +167,54 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
	kfree(iomaps);
	return -EINVAL;
}

int
nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
		u32 block_size)
{
	struct iomap *iomaps;
	u32 nr_iomaps, expected, i;

	if (len < sizeof(u32)) {
		dprintk("%s: extent array too small: %u\n", __func__, len);
		return -EINVAL;
	}

	nr_iomaps = be32_to_cpup(p++);
	expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
	if (len != expected) {
		dprintk("%s: extent array size mismatch: %u/%u\n",
			__func__, len, expected);
		return -EINVAL;
	}

	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
	if (!iomaps) {
		dprintk("%s: failed to allocate extent array\n", __func__);
		return -ENOMEM;
	}

	for (i = 0; i < nr_iomaps; i++) {
		u64 val;

		p = xdr_decode_hyper(p, &val);
		if (val & (block_size - 1)) {
			dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
			goto fail;
		}
		iomaps[i].offset = val;

		p = xdr_decode_hyper(p, &val);
		if (val & (block_size - 1)) {
			dprintk("%s: unaligned length 0x%llx\n", __func__, val);
			goto fail;
		}
		iomaps[i].length = val;
	}

	*iomapp = iomaps;
	return nr_iomaps;
fail:
	kfree(iomaps);
	return -EINVAL;
}
Loading