Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 59d3e8be authored by Alexey Skidanov's avatar Alexey Skidanov Committed by Oded Gabbay
Browse files

drm/amdkfd: Add memory exception handling



This patch adds Peripheral Page Request (PPR) failure processing
and reporting.

Bad address or pointer to a system memory block with inappropriate
read/write permission cause such PPR failure during a user queue
processing. PPR request handling is done by IOMMU driver notifying
AMDKFD module on PPR failure.

The process triggering a PPR failure will be notified by
appropriate event or SIGTERM signal will be sent to it.

v3:
- Change all bool fields in struct kfd_memory_exception_failure to
  uint32_t

Signed-off-by: default avatarAlexey Skidanov <alexey.skidanov@gmail.com>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
parent f3a39818
Loading
Loading
Loading
Loading
+29 −0
Original line number Diff line number Diff line
@@ -182,6 +182,32 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
		kfd_unbind_process_from_device(dev, pasid);
}

/*
 * This function called by IOMMU driver on PPR failure
 */
static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
		unsigned long address, u16 flags)
{
	struct kfd_dev *dev;

	dev_warn(kfd_device,
			"Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X",
			PCI_BUS_NUM(pdev->devfn),
			PCI_SLOT(pdev->devfn),
			PCI_FUNC(pdev->devfn),
			pasid,
			address,
			flags);

	dev = kfd_device_by_pci_dev(pdev);
	BUG_ON(dev == NULL);

	kfd_signal_iommu_event(dev, pasid, address,
			flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC);

	return AMD_IOMMU_INV_PRI_RSP_INVALID;
}

bool kgd2kfd_device_init(struct kfd_dev *kfd,
			 const struct kgd2kfd_shared_resources *gpu_resources)
{
@@ -251,6 +277,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
	}
	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
						iommu_pasid_shutdown_callback);
	amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);

	kfd->dqm = device_queue_manager_init(kfd);
	if (!kfd->dqm) {
@@ -316,6 +343,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
	if (kfd->init_complete) {
		kfd->dqm->ops.stop(kfd->dqm);
		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
		amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
		amd_iommu_free_device(kfd->pdev);
	}
}
@@ -335,6 +363,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
			return -ENXIO;
		amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
						iommu_pasid_shutdown_callback);
		amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
		kfd->dqm->ops.start(kfd->dqm);
	}

+147 −8
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
#include <linux/memory.h>
#include "kfd_priv.h"
#include "kfd_events.h"
#include <linux/device.h>

/*
 * A task can only be on a single wait_queue at a time, but we need to support
@@ -45,6 +46,10 @@ struct kfd_event_waiter {

	/* Transitions to true when the event this belongs to is signaled. */
	bool activated;

	/* Event */
	struct kfd_event *event;
	uint32_t input_index;
};

/*
@@ -610,13 +615,16 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)

static int init_event_waiter(struct kfd_process *p,
		struct kfd_event_waiter *waiter,
				uint32_t event_id)
		uint32_t event_id,
		uint32_t input_index)
{
	struct kfd_event *ev = lookup_event_by_id(p, event_id);

	if (!ev)
		return -EINVAL;

	waiter->event = ev;
	waiter->input_index = input_index;
	waiter->activated = ev->signaled;
	ev->signaled = ev->signaled && !ev->auto_reset;

@@ -643,6 +651,38 @@ static bool test_event_condition(bool all, uint32_t num_events,
	return activated_count == num_events;
}

/*
 * Copy event specific data, if defined.
 * Currently only memory exception events have additional data to copy to user
 */
static bool copy_signaled_event_data(uint32_t num_events,
		struct kfd_event_waiter *event_waiters,
		struct kfd_event_data __user *data)
{
	struct kfd_hsa_memory_exception_data *src;
	struct kfd_hsa_memory_exception_data __user *dst;
	struct kfd_event_waiter *waiter;
	struct kfd_event *event;
	uint32_t i;

	for (i = 0; i < num_events; i++) {
		waiter = &event_waiters[i];
		event = waiter->event;
		if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) {
			dst = &data[waiter->input_index].memory_exception_data;
			src = &event->memory_exception_data;
			if (copy_to_user(dst, src,
				sizeof(struct kfd_hsa_memory_exception_data)))
				return false;
		}
	}

	return true;

}



static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
{
	if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE)
@@ -672,10 +712,12 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters)
}

int kfd_wait_on_events(struct kfd_process *p,
		       uint32_t num_events, const uint32_t __user *event_ids,
		       uint32_t num_events, void __user *data,
		       bool all, uint32_t user_timeout_ms,
		       enum kfd_event_wait_result *wait_result)
{
	struct kfd_event_data __user *events =
			(struct kfd_event_data __user *) data;
	uint32_t i;
	int ret = 0;
	struct kfd_event_waiter *event_waiters = NULL;
@@ -690,13 +732,14 @@ int kfd_wait_on_events(struct kfd_process *p,
	}

	for (i = 0; i < num_events; i++) {
		uint32_t event_id;
		struct kfd_event_data event_data;

		ret = get_user(event_id, &event_ids[i]);
		if (ret)
		if (copy_from_user(&event_data, &events[i],
				sizeof(struct kfd_event_data)))
			goto fail;

		ret = init_event_waiter(p, &event_waiters[i], event_id);
		ret = init_event_waiter(p, &event_waiters[i],
				event_data.event_id, i);
		if (ret)
			goto fail;
	}
@@ -723,7 +766,11 @@ int kfd_wait_on_events(struct kfd_process *p,
		}

		if (test_event_condition(all, num_events, event_waiters)) {
			if (copy_signaled_event_data(num_events,
					event_waiters, events))
				*wait_result = KFD_WAIT_COMPLETE;
			else
				*wait_result = KFD_WAIT_ERROR;
			break;
		}

@@ -797,3 +844,95 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
	return remap_pfn_range(vma, vma->vm_start, pfn,
			vma->vm_end - vma->vm_start, vma->vm_page_prot);
}

/*
 * Assumes that p->event_mutex is held and of course
 * that p is not going away (current or locked).
 */
static void lookup_events_by_type_and_signal(struct kfd_process *p,
		int type, void *event_data)
{
	struct kfd_hsa_memory_exception_data *ev_data;
	struct kfd_event *ev;
	int bkt;
	bool send_signal = true;

	ev_data = (struct kfd_hsa_memory_exception_data *) event_data;

	hash_for_each(p->events, bkt, ev, events)
		if (ev->type == type) {
			send_signal = false;
			dev_dbg(kfd_device,
					"Event found: id %X type %d",
					ev->event_id, ev->type);
			set_event(ev);
			if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
				ev->memory_exception_data = *ev_data;
		}

	/* Send SIGTERM no event of type "type" has been found*/
	if (send_signal) {
		dev_warn(kfd_device,
			"Sending SIGTERM to HSA Process with PID %d ",
				p->lead_thread->pid);
		send_sig(SIGTERM, p->lead_thread, 0);
	}
}

void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
		unsigned long address, bool is_write_requested,
		bool is_execute_requested)
{
	struct kfd_hsa_memory_exception_data memory_exception_data;
	struct vm_area_struct *vma;

	/*
	 * Because we are called from arbitrary context (workqueue) as opposed
	 * to process context, kfd_process could attempt to exit while we are
	 * running so the lookup function returns a locked process.
	 */
	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return; /* Presumably process exited. */

	memset(&memory_exception_data, 0, sizeof(memory_exception_data));

	down_read(&p->mm->mmap_sem);
	vma = find_vma(p->mm, address);

	memory_exception_data.gpu_id = dev->id;
	memory_exception_data.va = address;
	/* Set failure reason */
	memory_exception_data.failure.NotPresent = 1;
	memory_exception_data.failure.NoExecute = 0;
	memory_exception_data.failure.ReadOnly = 0;
	if (vma) {
		if (vma->vm_start > address) {
			memory_exception_data.failure.NotPresent = 1;
			memory_exception_data.failure.NoExecute = 0;
			memory_exception_data.failure.ReadOnly = 0;
		} else {
			memory_exception_data.failure.NotPresent = 0;
			if (is_write_requested && !(vma->vm_flags & VM_WRITE))
				memory_exception_data.failure.ReadOnly = 1;
			else
				memory_exception_data.failure.ReadOnly = 0;
			if (is_execute_requested && !(vma->vm_flags & VM_EXEC))
				memory_exception_data.failure.NoExecute = 1;
			else
				memory_exception_data.failure.NoExecute = 0;
		}
	}

	up_read(&p->mm->mmap_sem);

	mutex_lock(&p->event_mutex);

	/* Lookup events by type and signal them */
	lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY,
			&memory_exception_data);

	mutex_unlock(&p->event_mutex);
	mutex_unlock(&p->mutex);
}
+7 −0
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include "kfd_priv.h"
#include <uapi/linux/kfd_ioctl.h>

#define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U
#define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK
@@ -61,6 +62,11 @@ struct kfd_event {
	struct signal_page *signal_page;
	unsigned int signal_slot_index;
	uint64_t __user *user_signal_address;

	/* type specific data */
	union {
		struct kfd_hsa_memory_exception_data memory_exception_data;
	};
};

#define KFD_EVENT_TIMEOUT_IMMEDIATE 0
@@ -69,6 +75,7 @@ struct kfd_event {
/* Matching HSA_EVENTTYPE */
#define KFD_EVENT_TYPE_SIGNAL 0
#define KFD_EVENT_TYPE_DEBUG 5
#define KFD_EVENT_TYPE_MEMORY 8

extern void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
					uint32_t valid_id_bits);
+4 −1
Original line number Diff line number Diff line
@@ -693,11 +693,14 @@ void kfd_event_init_process(struct kfd_process *p);
void kfd_event_free_process(struct kfd_process *p);
int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma);
int kfd_wait_on_events(struct kfd_process *p,
		       uint32_t num_events, const uint32_t __user *event_ids,
		       uint32_t num_events, void __user *data,
		       bool all, uint32_t user_timeout_ms,
		       enum kfd_event_wait_result *wait_result);
void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
				uint32_t valid_id_bits);
void kfd_signal_iommu_event(struct kfd_dev *dev,
		unsigned int pasid, unsigned long address,
		bool is_write_requested, bool is_execute_requested);
int kfd_set_event(struct kfd_process *p, uint32_t event_id);
int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
int kfd_event_create(struct file *devkfd, struct kfd_process *p,