Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d75671e3 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'vfio-v3.12-rc0' of git://github.com/awilliam/linux-vfio

Pull VFIO update from Alex Williamson:
 "VFIO updates include safer default file flags for VFIO device fds, an
  external user interface exported to allow other modules to hold
  references to VFIO groups, a fix to test for extended config space on
  PCIe and PCI-x, and new hot reset interfaces for PCI devices which
  allows the user to do PCI bus/slot resets when all of the devices
  affected by the reset are owned by the user.

  For this last feature, the PCI bus reset interface, I depend on
  changes already merged from Bjorn's PCI pull request.  I therefore
  merged my tree up to commit cb3e4330, which I think was the correct
  action, but as Stephen Rothwell noted, I failed to provide a commit
  message indicating why the merge was required.  Sorry for that.
  Thanks, Alex"

* tag 'vfio-v3.12-rc0' of git://github.com/awilliam/linux-vfio:
  vfio: fix documentation
  vfio-pci: PCI hot reset interface
  vfio-pci: Test for extended config space
  vfio-pci: Use fdget() rather than eventfd_fget()
  vfio: Add O_CLOEXEC flag to vfio device fd
  vfio: use get_unused_fd_flags(0) instead of get_unused_fd()
  vfio: add external user support
parents bf97293e dac09b57
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -167,8 +167,8 @@ group and can access them as follows:
	int container, group, device, i;
	struct vfio_group_status group_status =
					{ .argsz = sizeof(group_status) };
	struct vfio_iommu_x86_info iommu_info = { .argsz = sizeof(iommu_info) };
	struct vfio_iommu_x86_dma_map dma_map = { .argsz = sizeof(dma_map) };
	struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
	struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };

	/* Create a new container */
@@ -193,7 +193,7 @@ group and can access them as follows:
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);

	/* Enable the IOMMU model we want */
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	/* Get addition IOMMU info */
	ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info);
@@ -229,7 +229,7 @@ group and can access them as follows:

		irq.index = i;

		ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &reg);
		ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq);

		/* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */
	}
+285 −1
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
@@ -227,6 +228,110 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
	return 0;
}

static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	(*(int *)data)++;
	return 0;
}

struct vfio_pci_fill_info {
	int max;
	int cur;
	struct vfio_pci_dependent_device *devices;
};

static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
	if (!iommu_group)
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;
	fill->cur++;
	iommu_group_put(iommu_group);
	return 0;
}

struct vfio_pci_group_entry {
	struct vfio_group *group;
	int id;
};

struct vfio_pci_group_info {
	int count;
	struct vfio_pci_group_entry *groups;
};

static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_group_info *info = data;
	struct iommu_group *group;
	int id, i;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EPERM;

	id = iommu_group_id(group);

	for (i = 0; i < info->count; i++)
		if (info->groups[i].id == id)
			break;

	iommu_group_put(group);

	return (i == info->count) ? -EINVAL : 0;
}

static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);
	return false;
}

struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);
	void *data;
	struct pci_dev *pdev;
	bool slot;
	int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

	return walk->ret;
}

static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
	};

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}

static long vfio_pci_ioctl(void *device_data,
			   unsigned int cmd, unsigned long arg)
{
@@ -407,10 +512,189 @@ static long vfio_pci_ioctl(void *device_data,

		return ret;

	} else if (cmd == VFIO_DEVICE_RESET)
	} else if (cmd == VFIO_DEVICE_RESET) {
		return vdev->reset_works ?
			pci_reset_function(vdev->pdev) : -EINVAL;

	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
		struct vfio_pci_hot_reset_info hdr;
		struct vfio_pci_fill_info fill = { 0 };
		struct vfio_pci_dependent_device *devices = NULL;
		bool slot = false;
		int ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz)
			return -EINVAL;

		hdr.flags = 0;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/* How many devices are affected? */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &fill.max, slot);
		if (ret)
			return ret;

		WARN_ON(!fill.max); /* Should always be at least one */

		/*
		 * If there's enough space, fill it now, otherwise return
		 * -ENOSPC and the number of devices affected.
		 */
		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
			ret = -ENOSPC;
			hdr.count = fill.max;
			goto reset_info_exit;
		}

		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
		if (!devices)
			return -ENOMEM;

		fill.devices = devices;

		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_fill_devs,
						    &fill, slot);

		/*
		 * If a device was removed between counting and filling,
		 * we may come up short of fill.max.  If a device was
		 * added, we'll have a return of -EAGAIN above.
		 */
		if (!ret)
			hdr.count = fill.cur;

reset_info_exit:
		if (copy_to_user((void __user *)arg, &hdr, minsz))
			ret = -EFAULT;

		if (!ret) {
			if (copy_to_user((void __user *)(arg + minsz), devices,
					 hdr.count * sizeof(*devices)))
				ret = -EFAULT;
		}

		kfree(devices);
		return ret;

	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
		struct vfio_pci_hot_reset hdr;
		int32_t *group_fds;
		struct vfio_pci_group_entry *groups;
		struct vfio_pci_group_info info;
		bool slot = false;
		int i, count = 0, ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.flags)
			return -EINVAL;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/*
		 * We can't let userspace give us an arbitrarily large
		 * buffer to copy, so verify how many we think there
		 * could be.  Note groups can have multiple devices so
		 * one group per device is the max.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &count, slot);
		if (ret)
			return ret;

		/* Somewhere between 1 and count is OK */
		if (!hdr.count || hdr.count > count)
			return -EINVAL;

		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
		groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
		if (!group_fds || !groups) {
			kfree(group_fds);
			kfree(groups);
			return -ENOMEM;
		}

		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
				   hdr.count * sizeof(*group_fds))) {
			kfree(group_fds);
			kfree(groups);
			return -EFAULT;
		}

		/*
		 * For each group_fd, get the group through the vfio external
		 * user interface and store the group and iommu ID.  This
		 * ensures the group is held across the reset.
		 */
		for (i = 0; i < hdr.count; i++) {
			struct vfio_group *group;
			struct fd f = fdget(group_fds[i]);
			if (!f.file) {
				ret = -EBADF;
				break;
			}

			group = vfio_group_get_external_user(f.file);
			fdput(f);
			if (IS_ERR(group)) {
				ret = PTR_ERR(group);
				break;
			}

			groups[i].group = group;
			groups[i].id = vfio_external_user_iommu_id(group);
		}

		kfree(group_fds);

		/* release reference to groups on error */
		if (ret)
			goto hot_reset_release;

		info.count = hdr.count;
		info.groups = groups;

		/*
		 * Test whether all the affected devices are contained
		 * by the set of groups provided by the user.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_validate_devs,
						    &info, slot);
		if (!ret)
			/* User has access, do the reset */
			ret = slot ? pci_reset_slot(vdev->pdev->slot) :
				     pci_reset_bus(vdev->pdev->bus);

hot_reset_release:
		for (i--; i >= 0; i--)
			vfio_group_put_external_user(groups[i].group);

		kfree(groups);
		return ret;
	}

	return -ENOTTY;
}

+8 −3
Original line number Diff line number Diff line
@@ -1012,6 +1012,7 @@ static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos)
static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
{
	struct pci_dev *pdev = vdev->pdev;
	u32 dword;
	u16 word;
	u8 byte;
	int ret;
@@ -1025,7 +1026,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
			return pcibios_err_to_errno(ret);

		if (PCI_X_CMD_VERSION(word)) {
			vdev->extended_caps = true;
			/* Test for extended capabilities */
			pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
			vdev->extended_caps = (dword != 0);
			return PCI_CAP_PCIX_SIZEOF_V2;
		} else
			return PCI_CAP_PCIX_SIZEOF_V0;
@@ -1037,9 +1040,11 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)

		return byte;
	case PCI_CAP_ID_EXP:
		/* length based on version */
		vdev->extended_caps = true;
		/* Test for extended capabilities */
		pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
		vdev->extended_caps = (dword != 0);

		/* length based on version */
		if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
			return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
		else
+16 −19
Original line number Diff line number Diff line
@@ -130,8 +130,8 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
			 void (*thread)(struct vfio_pci_device *, void *),
			 void *data, struct virqfd **pvirqfd, int fd)
{
	struct file *file = NULL;
	struct eventfd_ctx *ctx = NULL;
	struct fd irqfd;
	struct eventfd_ctx *ctx;
	struct virqfd *virqfd;
	int ret = 0;
	unsigned int events;
@@ -149,16 +149,16 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
	INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
	INIT_WORK(&virqfd->inject, virqfd_inject);

	file = eventfd_fget(fd);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto fail;
	irqfd = fdget(fd);
	if (!irqfd.file) {
		ret = -EBADF;
		goto err_fd;
	}

	ctx = eventfd_ctx_fileget(file);
	ctx = eventfd_ctx_fileget(irqfd.file);
	if (IS_ERR(ctx)) {
		ret = PTR_ERR(ctx);
		goto fail;
		goto err_ctx;
	}

	virqfd->eventfd = ctx;
@@ -174,7 +174,7 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
	if (*pvirqfd) {
		spin_unlock_irq(&vdev->irqlock);
		ret = -EBUSY;
		goto fail;
		goto err_busy;
	}
	*pvirqfd = virqfd;

@@ -187,7 +187,7 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
	init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
	init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);

	events = file->f_op->poll(file, &virqfd->pt);
	events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);

	/*
	 * Check if there was an event already pending on the eventfd
@@ -202,17 +202,14 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
	 * Do not drop the file until the irqfd is fully initialized,
	 * otherwise we might race against the POLLHUP.
	 */
	fput(file);
	fdput(irqfd);

	return 0;

fail:
	if (ctx && !IS_ERR(ctx))
err_busy:
	eventfd_ctx_put(ctx);

	if (file && !IS_ERR(file))
		fput(file);

err_ctx:
	fdput(irqfd);
err_fd:
	kfree(virqfd);

	return ret;
+63 −1
Original line number Diff line number Diff line
@@ -1109,7 +1109,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
		 * We can't use anon_inode_getfd() because we need to modify
		 * the f_mode flags directly to allow more than just ioctls
		 */
		ret = get_unused_fd();
		ret = get_unused_fd_flags(O_CLOEXEC);
		if (ret < 0) {
			device->ops->release(device->device_data);
			break;
@@ -1352,6 +1352,68 @@ static const struct file_operations vfio_device_fops = {
	.mmap		= vfio_device_fops_mmap,
};

/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	if (!atomic_inc_not_zero(&group->container_users))
		return ERR_PTR(-EINVAL);

	if (!group->container->iommu_driver ||
			!vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return ERR_PTR(-EINVAL);
	}

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_put(group);
	vfio_group_try_dissolve_container(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

/**
 * Module/class support
 */
Loading